vllm.entrypoints.openai.translations.serving

logger module-attribute

logger = init_logger(__name__)

EngineClient

Bases: ABC

Protocol class for Clients to Engine

Source code in vllm/engine/protocol.py
class EngineClient(ABC):
    """Protocol class for Clients to Engine"""

    vllm_config: VllmConfig
    model_config: ModelConfig
    input_processor: InputProcessor
    io_processor: IOProcessor | None

    @property
    @abstractmethod
    def renderer(self) -> BaseRenderer: ...

    @property
    @abstractmethod
    def is_running(self) -> bool: ...

    @property
    @abstractmethod
    def is_stopped(self) -> bool: ...

    @property
    @abstractmethod
    def errored(self) -> bool: ...

    @property
    @abstractmethod
    def dead_error(self) -> BaseException: ...

    @abstractmethod
    def generate(
        self,
        prompt: EngineCoreRequest
        | PromptType
        | DictPrompt
        | TokPrompt
        | AsyncGenerator[StreamingInput, None],
        sampling_params: SamplingParams,
        request_id: str,
        *,
        prompt_text: str | None = None,
        lora_request: LoRARequest | None = None,
        tokenization_kwargs: dict[str, Any] | None = None,
        trace_headers: Mapping[str, str] | None = None,
        priority: int = 0,
        data_parallel_rank: int | None = None,
    ) -> AsyncGenerator[RequestOutput, None]:
        """Generate outputs for a request."""
        ...

    @abstractmethod
    def encode(
        self,
        prompt: PromptType | DictPrompt | TokPrompt,
        pooling_params: PoolingParams,
        request_id: str,
        lora_request: LoRARequest | None = None,
        trace_headers: Mapping[str, str] | None = None,
        priority: int = 0,
        tokenization_kwargs: dict[str, Any] | None = None,
    ) -> AsyncGenerator[PoolingRequestOutput, None]:
        """Generate outputs for a request from a pooling model."""
        ...

    @abstractmethod
    async def abort(self, request_id: str | Iterable[str]) -> None:
        """Abort a request.

        Args:
            request_id: The unique id of the request,
                        or an iterable of such ids.
        """
        ...

    @abstractmethod
    async def is_tracing_enabled(self) -> bool: ...

    @abstractmethod
    async def do_log_stats(self) -> None: ...

    @abstractmethod
    async def check_health(self) -> None:
        """Raise if unhealthy"""
        ...

    @abstractmethod
    async def start_profile(self) -> None:
        """Start profiling the engine"""
        ...

    @abstractmethod
    async def stop_profile(self) -> None:
        """Stop profiling the engine"""
        ...

    @abstractmethod
    async def reset_mm_cache(self) -> None:
        """Reset the multi-modal cache"""
        ...

    @abstractmethod
    async def reset_encoder_cache(self) -> None:
        """Reset the encoder cache"""
        ...

    @abstractmethod
    async def reset_prefix_cache(
        self, reset_running_requests: bool = False, reset_connector: bool = False
    ) -> bool:
        """Reset the prefix cache and optionally any configured connector cache"""
        ...

    @abstractmethod
    async def sleep(self, level: int = 1) -> None:
        """Sleep the engine"""
        ...

    @abstractmethod
    async def wake_up(self, tags: list[str] | None = None) -> None:
        """Wake up the engine"""
        ...

    @abstractmethod
    async def is_sleeping(self) -> bool:
        """Check whether the engine is sleeping"""
        ...

    @abstractmethod
    async def add_lora(self, lora_request: LoRARequest) -> bool:
        """Load a new LoRA adapter into the engine for future requests."""
        ...

    @abstractmethod
    async def pause_generation(
        self,
        *,
        mode: "PauseMode" = "abort",
        wait_for_inflight_requests: bool = False,
        clear_cache: bool = True,
    ) -> None:
        """Pause new generation/encoding requests.

        Args:
            mode: How to handle in-flight requests:
                - ``"abort"``: Abort all in-flight requests immediately
                  and return partial results with "abort" reason (default).
                - ``"wait"``: Wait for in-flight requests to complete.
                - ``"keep"``: Freeze requests in queue; they resume on
                  :meth:`resume_generation`.
            wait_for_inflight_requests: DEPRECATED. Use ``mode="wait"`` instead.
            clear_cache: DEPRECATED. Whether to clear KV and prefix caches
                after draining.
        """
        ...

    @abstractmethod
    async def resume_generation(self) -> None:
        """Resume accepting generation/encoding requests."""
        ...

    @abstractmethod
    async def is_paused(self) -> bool:
        """Return whether the engine is currently paused."""
        ...

    async def scale_elastic_ep(
        self, new_data_parallel_size: int, drain_timeout: int = 300
    ) -> None:
        """Scale the engine"""
        raise NotImplementedError

    async def collective_rpc(
        self,
        method: str,
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict | None = None,
    ):
        """Perform a collective RPC call to the given path."""
        raise NotImplementedError

    async def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
        """Get supported tasks"""
        raise NotImplementedError

    async def init_weight_transfer_engine(
        self, init_request: WeightTransferInitRequest
    ) -> None:
        """Initialize weight transfer for RL training."""
        raise NotImplementedError

    async def update_weights(self, request: WeightTransferUpdateRequest) -> None:
        """Batched weight update for RL training."""
        raise NotImplementedError
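
For orientation, here is a minimal sketch of driving an EngineClient implementation (for example vLLM's async engine) through generate(); construction of the engine object itself is assumed and not shown.

import uuid

from vllm import SamplingParams


async def run_one_request(engine) -> None:
    """Stream outputs for a single prompt via EngineClient.generate()."""
    params = SamplingParams(temperature=0.0, max_tokens=64)
    request_id = f"demo-{uuid.uuid4()}"

    # generate() yields RequestOutput objects; the final one has finished=True.
    async for output in engine.generate("Hello, my name is", params, request_id):
        if output.finished:
            print(output.outputs[0].text)

# e.g. asyncio.run(run_one_request(engine))  # 'engine' must satisfy EngineClient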

dead_error abstractmethod property

dead_error: BaseException

errored abstractmethod property

errored: bool

input_processor instance-attribute

input_processor: InputProcessor

io_processor instance-attribute

io_processor: IOProcessor | None

is_running abstractmethod property

is_running: bool

is_stopped abstractmethod property

is_stopped: bool

model_config instance-attribute

model_config: ModelConfig

renderer abstractmethod property

renderer: BaseRenderer

vllm_config instance-attribute

vllm_config: VllmConfig

abort abstractmethod async

abort(request_id: str | Iterable[str]) -> None

Abort a request.

Parameters:

Name Type Description Default
request_id str | Iterable[str]

The unique id of the request, or an iterable of such ids.

required
Source code in vllm/engine/protocol.py
@abstractmethod
async def abort(self, request_id: str | Iterable[str]) -> None:
    """Abort a request.

    Args:
        request_id: The unique id of the request,
                    or an iterable of such ids.
    """
    ...

add_lora abstractmethod async

add_lora(lora_request: LoRARequest) -> bool

Load a new LoRA adapter into the engine for future requests.

Source code in vllm/engine/protocol.py
@abstractmethod
async def add_lora(self, lora_request: LoRARequest) -> bool:
    """Load a new LoRA adapter into the engine for future requests."""
    ...

check_health abstractmethod async

check_health() -> None

Raise if unhealthy

Source code in vllm/engine/protocol.py
@abstractmethod
async def check_health(self) -> None:
    """Raise if unhealthy"""
    ...
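
As an illustration, check_health() can back a simple readiness probe; this is a hypothetical helper, not part of the API above.

import asyncio


async def wait_until_healthy(engine, retries: int = 10, delay_s: float = 1.0) -> None:
    """Poll EngineClient.check_health() until it stops raising."""
    for _ in range(retries):
        try:
            await engine.check_health()  # raises if the engine is unhealthy
            return
        except Exception:
            await asyncio.sleep(delay_s)
    raise RuntimeError("engine did not become healthy in time")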

collective_rpc async

collective_rpc(
    method: str,
    timeout: float | None = None,
    args: tuple = (),
    kwargs: dict | None = None,
)

Perform a collective RPC call to the given path.

Source code in vllm/engine/protocol.py
async def collective_rpc(
    self,
    method: str,
    timeout: float | None = None,
    args: tuple = (),
    kwargs: dict | None = None,
):
    """Perform a collective RPC call to the given path."""
    raise NotImplementedError

do_log_stats abstractmethod async

do_log_stats() -> None
Source code in vllm/engine/protocol.py
@abstractmethod
async def do_log_stats(self) -> None: ...

encode abstractmethod

encode(
    prompt: PromptType | DictPrompt | TokPrompt,
    pooling_params: PoolingParams,
    request_id: str,
    lora_request: LoRARequest | None = None,
    trace_headers: Mapping[str, str] | None = None,
    priority: int = 0,
    tokenization_kwargs: dict[str, Any] | None = None,
) -> AsyncGenerator[PoolingRequestOutput, None]

Generate outputs for a request from a pooling model.

Source code in vllm/engine/protocol.py
@abstractmethod
def encode(
    self,
    prompt: PromptType | DictPrompt | TokPrompt,
    pooling_params: PoolingParams,
    request_id: str,
    lora_request: LoRARequest | None = None,
    trace_headers: Mapping[str, str] | None = None,
    priority: int = 0,
    tokenization_kwargs: dict[str, Any] | None = None,
) -> AsyncGenerator[PoolingRequestOutput, None]:
    """Generate outputs for a request from a pooling model."""
    ...

generate abstractmethod

generate(
    prompt: EngineCoreRequest
    | PromptType
    | DictPrompt
    | TokPrompt
    | AsyncGenerator[StreamingInput, None],
    sampling_params: SamplingParams,
    request_id: str,
    *,
    prompt_text: str | None = None,
    lora_request: LoRARequest | None = None,
    tokenization_kwargs: dict[str, Any] | None = None,
    trace_headers: Mapping[str, str] | None = None,
    priority: int = 0,
    data_parallel_rank: int | None = None,
) -> AsyncGenerator[RequestOutput, None]

Generate outputs for a request.

Source code in vllm/engine/protocol.py
@abstractmethod
def generate(
    self,
    prompt: EngineCoreRequest
    | PromptType
    | DictPrompt
    | TokPrompt
    | AsyncGenerator[StreamingInput, None],
    sampling_params: SamplingParams,
    request_id: str,
    *,
    prompt_text: str | None = None,
    lora_request: LoRARequest | None = None,
    tokenization_kwargs: dict[str, Any] | None = None,
    trace_headers: Mapping[str, str] | None = None,
    priority: int = 0,
    data_parallel_rank: int | None = None,
) -> AsyncGenerator[RequestOutput, None]:
    """Generate outputs for a request."""
    ...

get_supported_tasks async

get_supported_tasks() -> tuple[SupportedTask, ...]

Get supported tasks

Source code in vllm/engine/protocol.py
async def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
    """Get supported tasks"""
    raise NotImplementedError

init_weight_transfer_engine async

init_weight_transfer_engine(
    init_request: WeightTransferInitRequest,
) -> None

Initialize weight transfer for RL training.

Source code in vllm/engine/protocol.py
async def init_weight_transfer_engine(
    self, init_request: WeightTransferInitRequest
) -> None:
    """Initialize weight transfer for RL training."""
    raise NotImplementedError

is_paused abstractmethod async

is_paused() -> bool

Return whether the engine is currently paused.

Source code in vllm/engine/protocol.py
@abstractmethod
async def is_paused(self) -> bool:
    """Return whether the engine is currently paused."""
    ...

is_sleeping abstractmethod async

is_sleeping() -> bool

Check whether the engine is sleeping

Source code in vllm/engine/protocol.py
@abstractmethod
async def is_sleeping(self) -> bool:
    """Check whether the engine is sleeping"""
    ...

is_tracing_enabled abstractmethod async

is_tracing_enabled() -> bool
Source code in vllm/engine/protocol.py
@abstractmethod
async def is_tracing_enabled(self) -> bool: ...

pause_generation abstractmethod async

pause_generation(
    *,
    mode: PauseMode = "abort",
    wait_for_inflight_requests: bool = False,
    clear_cache: bool = True,
) -> None

Pause new generation/encoding requests.

Parameters:

Name Type Description Default
mode PauseMode

How to handle in-flight requests:

- "abort": Abort all in-flight requests immediately and return partial results with "abort" reason (default).
- "wait": Wait for in-flight requests to complete.
- "keep": Freeze requests in queue; they resume on resume_generation.

'abort'
wait_for_inflight_requests bool

DEPRECATED. Use mode="wait" instead.

False
clear_cache bool

DEPRECATED. Whether to clear KV and prefix caches after draining.

True
Source code in vllm/engine/protocol.py
@abstractmethod
async def pause_generation(
    self,
    *,
    mode: "PauseMode" = "abort",
    wait_for_inflight_requests: bool = False,
    clear_cache: bool = True,
) -> None:
    """Pause new generation/encoding requests.

    Args:
        mode: How to handle in-flight requests:
            - ``"abort"``: Abort all in-flight requests immediately
              and return partial results with "abort" reason (default).
            - ``"wait"``: Wait for in-flight requests to complete.
            - ``"keep"``: Freeze requests in queue; they resume on
              :meth:`resume_generation`.
        wait_for_inflight_requests: DEPRECATED. Use ``mode="wait"`` instead.
        clear_cache: DEPRECATED. Whether to clear KV and prefix caches
            after draining.
    """
    ...
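
A hedged sketch of how pause_generation() and resume_generation() might be combined around a maintenance window (for example, a weight update in an RL loop); the engine object and the maintenance step are assumptions.

async def with_paused_engine(engine) -> None:
    """Drain in-flight requests, do maintenance, then resume serving."""
    # mode="wait" lets in-flight requests finish instead of aborting them.
    await engine.pause_generation(mode="wait")
    assert await engine.is_paused()

    # ... maintenance goes here, e.g. resetting caches or swapping weights ...

    await engine.resume_generation()
    assert not await engine.is_paused()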

reset_encoder_cache abstractmethod async

reset_encoder_cache() -> None

Reset the encoder cache

Source code in vllm/engine/protocol.py
@abstractmethod
async def reset_encoder_cache(self) -> None:
    """Reset the encoder cache"""
    ...

reset_mm_cache abstractmethod async

reset_mm_cache() -> None

Reset the multi-modal cache

Source code in vllm/engine/protocol.py
@abstractmethod
async def reset_mm_cache(self) -> None:
    """Reset the multi-modal cache"""
    ...

reset_prefix_cache abstractmethod async

reset_prefix_cache(
    reset_running_requests: bool = False,
    reset_connector: bool = False,
) -> bool

Reset the prefix cache and optionally any configured connector cache

Source code in vllm/engine/protocol.py
@abstractmethod
async def reset_prefix_cache(
    self, reset_running_requests: bool = False, reset_connector: bool = False
) -> bool:
    """Reset the prefix cache and optionally any configured connector cache"""
    ...

resume_generation abstractmethod async

resume_generation() -> None

Resume accepting generation/encoding requests.

Source code in vllm/engine/protocol.py
@abstractmethod
async def resume_generation(self) -> None:
    """Resume accepting generation/encoding requests."""
    ...

scale_elastic_ep async

scale_elastic_ep(
    new_data_parallel_size: int, drain_timeout: int = 300
) -> None

Scale the engine

Source code in vllm/engine/protocol.py
async def scale_elastic_ep(
    self, new_data_parallel_size: int, drain_timeout: int = 300
) -> None:
    """Scale the engine"""
    raise NotImplementedError

sleep abstractmethod async

sleep(level: int = 1) -> None

Sleep the engine

Source code in vllm/engine/protocol.py
@abstractmethod
async def sleep(self, level: int = 1) -> None:
    """Sleep the engine"""
    ...

start_profile abstractmethod async

start_profile() -> None

Start profiling the engine

Source code in vllm/engine/protocol.py
@abstractmethod
async def start_profile(self) -> None:
    """Start profiling the engine"""
    ...

stop_profile abstractmethod async

stop_profile() -> None

Stop profiling the engine

Source code in vllm/engine/protocol.py
@abstractmethod
async def stop_profile(self) -> None:
    """Stop profiling the engine"""
    ...

update_weights async

update_weights(
    request: WeightTransferUpdateRequest,
) -> None

Batched weight update for RL training.

Source code in vllm/engine/protocol.py
async def update_weights(self, request: WeightTransferUpdateRequest) -> None:
    """Batched weight update for RL training."""
    raise NotImplementedError

wake_up abstractmethod async

wake_up(tags: list[str] | None = None) -> None

Wake up the engine

Source code in vllm/engine/protocol.py
@abstractmethod
async def wake_up(self, tags: list[str] | None = None) -> None:
    """Wake up the engine"""
    ...
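
For context, a possible sleep/wake cycle to release accelerator state between bursts of traffic; the exact effect of each sleep level depends on the engine configuration, so treat this as a sketch.

async def idle_then_resume(engine) -> None:
    """Put the engine to sleep while idle and wake it before serving again."""
    await engine.sleep(level=1)      # higher levels release more state
    assert await engine.is_sleeping()

    # ... later, when traffic returns ...
    await engine.wake_up()           # wake_up(tags=[...]) can wake selectively
    assert not await engine.is_sleeping()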

ErrorResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/engine/protocol.py
class ErrorResponse(OpenAIBaseModel):
    error: ErrorInfo

error instance-attribute

error: ErrorInfo

OpenAIServingModels

Shared instance to hold data about the loaded base model(s) and adapters.

Handles the routes:

- /v1/models
- /v1/load_lora_adapter
- /v1/unload_lora_adapter

Source code in vllm/entrypoints/openai/models/serving.py
class OpenAIServingModels:
    """Shared instance to hold data about the loaded base model(s) and adapters.

    Handles the routes:
    - /v1/models
    - /v1/load_lora_adapter
    - /v1/unload_lora_adapter
    """

    def __init__(
        self,
        engine_client: EngineClient,
        base_model_paths: list[BaseModelPath],
        *,
        lora_modules: list[LoRAModulePath] | None = None,
    ):
        super().__init__()

        self.engine_client = engine_client
        self.base_model_paths = base_model_paths

        self.static_lora_modules = lora_modules
        self.lora_requests: dict[str, LoRARequest] = {}
        self.lora_id_counter = AtomicCounter(0)

        self.lora_resolvers: list[LoRAResolver] = []
        for lora_resolver_name in LoRAResolverRegistry.get_supported_resolvers():
            self.lora_resolvers.append(
                LoRAResolverRegistry.get_resolver(lora_resolver_name)
            )
        self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)

        self.input_processor = self.engine_client.input_processor
        self.io_processor = self.engine_client.io_processor
        self.renderer = self.engine_client.renderer
        self.model_config = self.engine_client.model_config
        self.max_model_len = self.model_config.max_model_len

    async def init_static_loras(self):
        """Loads all static LoRA modules.
        Raises if any fail to load"""
        if self.static_lora_modules is None:
            return
        for lora in self.static_lora_modules:
            load_request = LoadLoRAAdapterRequest(
                lora_path=lora.path, lora_name=lora.name
            )
            load_result = await self.load_lora_adapter(
                request=load_request, base_model_name=lora.base_model_name
            )
            if isinstance(load_result, ErrorResponse):
                raise ValueError(load_result.error.message)

    def is_base_model(self, model_name) -> bool:
        return any(model.name == model_name for model in self.base_model_paths)

    def model_name(self, lora_request: LoRARequest | None = None) -> str:
        """Returns the appropriate model name depending on the availability
        and support of the LoRA or base model.
        Parameters:
        - lora: LoRARequest that contain a base_model_name.
        Returns:
        - str: The name of the base model or the first available model path.
        """
        if lora_request is not None:
            return lora_request.lora_name
        return self.base_model_paths[0].name

    async def show_available_models(self) -> ModelList:
        """Show available models. This includes the base model and all
        adapters"""
        model_cards = [
            ModelCard(
                id=base_model.name,
                max_model_len=self.max_model_len,
                root=base_model.model_path,
                permission=[ModelPermission()],
            )
            for base_model in self.base_model_paths
        ]
        lora_cards = [
            ModelCard(
                id=lora.lora_name,
                root=lora.path,
                parent=lora.base_model_name
                if lora.base_model_name
                else self.base_model_paths[0].name,
                permission=[ModelPermission()],
            )
            for lora in self.lora_requests.values()
        ]
        model_cards.extend(lora_cards)
        return ModelList(data=model_cards)

    async def load_lora_adapter(
        self, request: LoadLoRAAdapterRequest, base_model_name: str | None = None
    ) -> ErrorResponse | str:
        lora_name = request.lora_name

        # Ensure atomicity based on the lora name
        async with self.lora_resolver_lock[lora_name]:
            error_check_ret = await self._check_load_lora_adapter_request(request)
            if error_check_ret is not None:
                return error_check_ret

            lora_path = request.lora_path
            lora_int_id = (
                self.lora_requests[lora_name].lora_int_id
                if lora_name in self.lora_requests
                else self.lora_id_counter.inc(1)
            )
            lora_request = LoRARequest(
                lora_name=lora_name,
                lora_int_id=lora_int_id,
                lora_path=lora_path,
                load_inplace=request.load_inplace,
            )
            if base_model_name is not None and self.is_base_model(base_model_name):
                lora_request.base_model_name = base_model_name

            # Validate that the adapter can be loaded into the engine
            # This will also preload it for incoming requests
            try:
                await self.engine_client.add_lora(lora_request)
            except Exception as e:
                error_type = "BadRequestError"
                status_code = HTTPStatus.BAD_REQUEST
                if "No adapter found" in str(e):
                    error_type = "NotFoundError"
                    status_code = HTTPStatus.NOT_FOUND

                return create_error_response(
                    message=str(e), err_type=error_type, status_code=status_code
                )

            self.lora_requests[lora_name] = lora_request
            logger.info(
                "Loaded new LoRA adapter: name '%s', path '%s'", lora_name, lora_path
            )
            return f"Success: LoRA adapter '{lora_name}' added successfully."

    async def unload_lora_adapter(
        self, request: UnloadLoRAAdapterRequest
    ) -> ErrorResponse | str:
        lora_name = request.lora_name

        # Ensure atomicity based on the lora name
        async with self.lora_resolver_lock[lora_name]:
            error_check_ret = await self._check_unload_lora_adapter_request(request)
            if error_check_ret is not None:
                return error_check_ret

            # Safe to delete now since we hold the lock
            del self.lora_requests[lora_name]
            logger.info("Removed LoRA adapter: name '%s'", lora_name)
            return f"Success: LoRA adapter '{lora_name}' removed successfully."

    async def _check_load_lora_adapter_request(
        self, request: LoadLoRAAdapterRequest
    ) -> ErrorResponse | None:
        # Check if both 'lora_name' and 'lora_path' are provided
        if not request.lora_name or not request.lora_path:
            return create_error_response(
                message="Both 'lora_name' and 'lora_path' must be provided.",
                err_type="InvalidUserInput",
                status_code=HTTPStatus.BAD_REQUEST,
            )

        # If not loading inplace
        # Check if the lora adapter with the given name already exists
        if not request.load_inplace and request.lora_name in self.lora_requests:
            return create_error_response(
                message=f"The lora adapter '{request.lora_name}' has already been "
                "loaded. If you want to load the adapter in place, set 'load_inplace'"
                " to True.",
                err_type="InvalidUserInput",
                status_code=HTTPStatus.BAD_REQUEST,
            )

        return None

    async def _check_unload_lora_adapter_request(
        self, request: UnloadLoRAAdapterRequest
    ) -> ErrorResponse | None:
        # Check if 'lora_name' is not provided return an error
        if not request.lora_name:
            return create_error_response(
                message="'lora_name' needs to be provided to unload a LoRA adapter.",
                err_type="InvalidUserInput",
                status_code=HTTPStatus.BAD_REQUEST,
            )

        # Check if the lora adapter with the given name exists
        if request.lora_name not in self.lora_requests:
            return create_error_response(
                message=f"The lora adapter '{request.lora_name}' cannot be found.",
                err_type="NotFoundError",
                status_code=HTTPStatus.NOT_FOUND,
            )

        return None

    async def resolve_lora(self, lora_name: str) -> LoRARequest | ErrorResponse:
        """Attempt to resolve a LoRA adapter using available resolvers.

        Args:
            lora_name: Name/identifier of the LoRA adapter

        Returns:
            LoRARequest if found and loaded successfully.
            ErrorResponse (404) if no resolver finds the adapter.
            ErrorResponse (400) if adapter(s) are found but none load.
        """
        async with self.lora_resolver_lock[lora_name]:
            # First check if this LoRA is already loaded
            if lora_name in self.lora_requests:
                return self.lora_requests[lora_name]

            base_model_name = self.model_config.model
            unique_id = self.lora_id_counter.inc(1)
            found_adapter = False

            # Try to resolve using available resolvers
            for resolver in self.lora_resolvers:
                lora_request = await resolver.resolve_lora(base_model_name, lora_name)

                if lora_request is not None:
                    found_adapter = True
                    lora_request.lora_int_id = unique_id

                    try:
                        await self.engine_client.add_lora(lora_request)
                        self.lora_requests[lora_name] = lora_request
                        logger.info(
                            "Resolved and loaded LoRA adapter '%s' using %s",
                            lora_name,
                            resolver.__class__.__name__,
                        )
                        return lora_request
                    except BaseException as e:
                        logger.warning(
                            "Failed to load LoRA '%s' resolved by %s: %s. "
                            "Trying next resolver.",
                            lora_name,
                            resolver.__class__.__name__,
                            e,
                        )
                        continue

            if found_adapter:
                # An adapter was found, but all attempts to load it failed.
                return create_error_response(
                    message=(
                        f"LoRA adapter '{lora_name}' was found but could not be loaded."
                    ),
                    err_type="BadRequestError",
                    status_code=HTTPStatus.BAD_REQUEST,
                )
            else:
                # No adapter was found
                return create_error_response(
                    message=f"LoRA adapter {lora_name} does not exist",
                    err_type="NotFoundError",
                    status_code=HTTPStatus.NOT_FOUND,
                )
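
The class above only implements the server side; a client-side sketch of the two dynamic-adapter routes is shown below. The base URL, adapter name, and path are placeholders, and depending on the deployment, dynamic LoRA loading may need to be enabled explicitly.

import requests

BASE_URL = "http://localhost:8000"  # assumed vLLM OpenAI-compatible server


def load_adapter(name: str, path: str) -> str:
    """POST /v1/load_lora_adapter with the fields LoadLoRAAdapterRequest expects."""
    resp = requests.post(
        f"{BASE_URL}/v1/load_lora_adapter",
        json={"lora_name": name, "lora_path": path},
    )
    resp.raise_for_status()
    return resp.text  # "Success: LoRA adapter '<name>' added successfully."


def unload_adapter(name: str) -> str:
    """POST /v1/unload_lora_adapter to remove a previously loaded adapter."""
    resp = requests.post(
        f"{BASE_URL}/v1/unload_lora_adapter",
        json={"lora_name": name},
    )
    resp.raise_for_status()
    return resp.text


# load_adapter("sql-lora", "/path/to/sql-lora")
# unload_adapter("sql-lora")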

base_model_paths instance-attribute

base_model_paths = base_model_paths

engine_client instance-attribute

engine_client = engine_client

input_processor instance-attribute

input_processor = input_processor

io_processor instance-attribute

io_processor = io_processor

lora_id_counter instance-attribute

lora_id_counter = AtomicCounter(0)

lora_requests instance-attribute

lora_requests: dict[str, LoRARequest] = {}

lora_resolver_lock instance-attribute

lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)

lora_resolvers instance-attribute

lora_resolvers: list[LoRAResolver] = []

max_model_len instance-attribute

max_model_len = max_model_len

model_config instance-attribute

model_config = model_config

renderer instance-attribute

renderer = renderer

static_lora_modules instance-attribute

static_lora_modules = lora_modules

__init__

__init__(
    engine_client: EngineClient,
    base_model_paths: list[BaseModelPath],
    *,
    lora_modules: list[LoRAModulePath] | None = None,
)
Source code in vllm/entrypoints/openai/models/serving.py
def __init__(
    self,
    engine_client: EngineClient,
    base_model_paths: list[BaseModelPath],
    *,
    lora_modules: list[LoRAModulePath] | None = None,
):
    super().__init__()

    self.engine_client = engine_client
    self.base_model_paths = base_model_paths

    self.static_lora_modules = lora_modules
    self.lora_requests: dict[str, LoRARequest] = {}
    self.lora_id_counter = AtomicCounter(0)

    self.lora_resolvers: list[LoRAResolver] = []
    for lora_resolver_name in LoRAResolverRegistry.get_supported_resolvers():
        self.lora_resolvers.append(
            LoRAResolverRegistry.get_resolver(lora_resolver_name)
        )
    self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)

    self.input_processor = self.engine_client.input_processor
    self.io_processor = self.engine_client.io_processor
    self.renderer = self.engine_client.renderer
    self.model_config = self.engine_client.model_config
    self.max_model_len = self.model_config.max_model_len

_check_load_lora_adapter_request async

_check_load_lora_adapter_request(
    request: LoadLoRAAdapterRequest,
) -> ErrorResponse | None
Source code in vllm/entrypoints/openai/models/serving.py
async def _check_load_lora_adapter_request(
    self, request: LoadLoRAAdapterRequest
) -> ErrorResponse | None:
    # Check if both 'lora_name' and 'lora_path' are provided
    if not request.lora_name or not request.lora_path:
        return create_error_response(
            message="Both 'lora_name' and 'lora_path' must be provided.",
            err_type="InvalidUserInput",
            status_code=HTTPStatus.BAD_REQUEST,
        )

    # If not loading inplace
    # Check if the lora adapter with the given name already exists
    if not request.load_inplace and request.lora_name in self.lora_requests:
        return create_error_response(
            message=f"The lora adapter '{request.lora_name}' has already been "
            "loaded. If you want to load the adapter in place, set 'load_inplace'"
            " to True.",
            err_type="InvalidUserInput",
            status_code=HTTPStatus.BAD_REQUEST,
        )

    return None

_check_unload_lora_adapter_request async

_check_unload_lora_adapter_request(
    request: UnloadLoRAAdapterRequest,
) -> ErrorResponse | None
Source code in vllm/entrypoints/openai/models/serving.py
async def _check_unload_lora_adapter_request(
    self, request: UnloadLoRAAdapterRequest
) -> ErrorResponse | None:
    # Check if 'lora_name' is not provided return an error
    if not request.lora_name:
        return create_error_response(
            message="'lora_name' needs to be provided to unload a LoRA adapter.",
            err_type="InvalidUserInput",
            status_code=HTTPStatus.BAD_REQUEST,
        )

    # Check if the lora adapter with the given name exists
    if request.lora_name not in self.lora_requests:
        return create_error_response(
            message=f"The lora adapter '{request.lora_name}' cannot be found.",
            err_type="NotFoundError",
            status_code=HTTPStatus.NOT_FOUND,
        )

    return None

init_static_loras async

init_static_loras()

Loads all static LoRA modules. Raises if any fail to load.

Source code in vllm/entrypoints/openai/models/serving.py
async def init_static_loras(self):
    """Loads all static LoRA modules.
    Raises if any fail to load"""
    if self.static_lora_modules is None:
        return
    for lora in self.static_lora_modules:
        load_request = LoadLoRAAdapterRequest(
            lora_path=lora.path, lora_name=lora.name
        )
        load_result = await self.load_lora_adapter(
            request=load_request, base_model_name=lora.base_model_name
        )
        if isinstance(load_result, ErrorResponse):
            raise ValueError(load_result.error.message)

is_base_model

is_base_model(model_name) -> bool
Source code in vllm/entrypoints/openai/models/serving.py
def is_base_model(self, model_name) -> bool:
    return any(model.name == model_name for model in self.base_model_paths)

load_lora_adapter async

load_lora_adapter(
    request: LoadLoRAAdapterRequest,
    base_model_name: str | None = None,
) -> ErrorResponse | str
Source code in vllm/entrypoints/openai/models/serving.py
async def load_lora_adapter(
    self, request: LoadLoRAAdapterRequest, base_model_name: str | None = None
) -> ErrorResponse | str:
    lora_name = request.lora_name

    # Ensure atomicity based on the lora name
    async with self.lora_resolver_lock[lora_name]:
        error_check_ret = await self._check_load_lora_adapter_request(request)
        if error_check_ret is not None:
            return error_check_ret

        lora_path = request.lora_path
        lora_int_id = (
            self.lora_requests[lora_name].lora_int_id
            if lora_name in self.lora_requests
            else self.lora_id_counter.inc(1)
        )
        lora_request = LoRARequest(
            lora_name=lora_name,
            lora_int_id=lora_int_id,
            lora_path=lora_path,
            load_inplace=request.load_inplace,
        )
        if base_model_name is not None and self.is_base_model(base_model_name):
            lora_request.base_model_name = base_model_name

        # Validate that the adapter can be loaded into the engine
        # This will also preload it for incoming requests
        try:
            await self.engine_client.add_lora(lora_request)
        except Exception as e:
            error_type = "BadRequestError"
            status_code = HTTPStatus.BAD_REQUEST
            if "No adapter found" in str(e):
                error_type = "NotFoundError"
                status_code = HTTPStatus.NOT_FOUND

            return create_error_response(
                message=str(e), err_type=error_type, status_code=status_code
            )

        self.lora_requests[lora_name] = lora_request
        logger.info(
            "Loaded new LoRA adapter: name '%s', path '%s'", lora_name, lora_path
        )
        return f"Success: LoRA adapter '{lora_name}' added successfully."

model_name

model_name(lora_request: LoRARequest | None = None) -> str

Returns the model name to report for a request: the LoRA adapter name when lora_request is provided, otherwise the name of the first configured base model.

Parameters:

Name Type Description Default
lora_request LoRARequest | None

An optional LoRARequest; if provided, its lora_name is returned.

None

Returns:

Type Description
str

The LoRA adapter name, or the name of the first base model path.

Source code in vllm/entrypoints/openai/models/serving.py
def model_name(self, lora_request: LoRARequest | None = None) -> str:
    """Returns the appropriate model name depending on the availability
    and support of the LoRA or base model.
    Parameters:
    - lora: LoRARequest that contain a base_model_name.
    Returns:
    - str: The name of the base model or the first available model path.
    """
    if lora_request is not None:
        return lora_request.lora_name
    return self.base_model_paths[0].name

resolve_lora async

resolve_lora(lora_name: str) -> LoRARequest | ErrorResponse

Attempt to resolve a LoRA adapter using available resolvers.

Parameters:

Name Type Description Default
lora_name str

Name/identifier of the LoRA adapter

required

Returns:

Type Description
LoRARequest | ErrorResponse

- LoRARequest if found and loaded successfully.
- ErrorResponse (404) if no resolver finds the adapter.
- ErrorResponse (400) if adapter(s) are found but none load.

Source code in vllm/entrypoints/openai/models/serving.py
async def resolve_lora(self, lora_name: str) -> LoRARequest | ErrorResponse:
    """Attempt to resolve a LoRA adapter using available resolvers.

    Args:
        lora_name: Name/identifier of the LoRA adapter

    Returns:
        LoRARequest if found and loaded successfully.
        ErrorResponse (404) if no resolver finds the adapter.
        ErrorResponse (400) if adapter(s) are found but none load.
    """
    async with self.lora_resolver_lock[lora_name]:
        # First check if this LoRA is already loaded
        if lora_name in self.lora_requests:
            return self.lora_requests[lora_name]

        base_model_name = self.model_config.model
        unique_id = self.lora_id_counter.inc(1)
        found_adapter = False

        # Try to resolve using available resolvers
        for resolver in self.lora_resolvers:
            lora_request = await resolver.resolve_lora(base_model_name, lora_name)

            if lora_request is not None:
                found_adapter = True
                lora_request.lora_int_id = unique_id

                try:
                    await self.engine_client.add_lora(lora_request)
                    self.lora_requests[lora_name] = lora_request
                    logger.info(
                        "Resolved and loaded LoRA adapter '%s' using %s",
                        lora_name,
                        resolver.__class__.__name__,
                    )
                    return lora_request
                except BaseException as e:
                    logger.warning(
                        "Failed to load LoRA '%s' resolved by %s: %s. "
                        "Trying next resolver.",
                        lora_name,
                        resolver.__class__.__name__,
                        e,
                    )
                    continue

        if found_adapter:
            # An adapter was found, but all attempts to load it failed.
            return create_error_response(
                message=(
                    f"LoRA adapter '{lora_name}' was found but could not be loaded."
                ),
                err_type="BadRequestError",
                status_code=HTTPStatus.BAD_REQUEST,
            )
        else:
            # No adapter was found
            return create_error_response(
                message=f"LoRA adapter {lora_name} does not exist",
                err_type="NotFoundError",
                status_code=HTTPStatus.NOT_FOUND,
            )
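
The loop above only assumes that a resolver exposes an async resolve_lora(base_model_name, lora_name) returning a LoRARequest or None. A hypothetical filesystem-backed resolver following that shape might look like this (the directory layout and registration with LoRAResolverRegistry are assumptions and not shown here).

import os

from vllm.lora.request import LoRARequest


class LocalDirLoRAResolver:
    """Hypothetical resolver that maps lora_name to a subdirectory on disk."""

    def __init__(self, root: str = "/adapters"):
        self.root = root

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        path = os.path.join(self.root, lora_name)
        if not os.path.isdir(path):
            return None  # not found: resolve_lora() moves on to the next resolver
        # lora_int_id is overwritten by the caller before the adapter is loaded.
        return LoRARequest(lora_name=lora_name, lora_int_id=0, lora_path=path)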

show_available_models async

show_available_models() -> ModelList

Show available models. This includes the base model and all adapters.

Source code in vllm/entrypoints/openai/models/serving.py
async def show_available_models(self) -> ModelList:
    """Show available models. This includes the base model and all
    adapters"""
    model_cards = [
        ModelCard(
            id=base_model.name,
            max_model_len=self.max_model_len,
            root=base_model.model_path,
            permission=[ModelPermission()],
        )
        for base_model in self.base_model_paths
    ]
    lora_cards = [
        ModelCard(
            id=lora.lora_name,
            root=lora.path,
            parent=lora.base_model_name
            if lora.base_model_name
            else self.base_model_paths[0].name,
            permission=[ModelPermission()],
        )
        for lora in self.lora_requests.values()
    ]
    model_cards.extend(lora_cards)
    return ModelList(data=model_cards)
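
On the client side, the result of show_available_models() is what /v1/models returns; a small example with the OpenAI Python client (base URL and API key are placeholders for a local server):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Both base models and any loaded LoRA adapters appear as model entries.
for model in client.models.list().data:
    print(model.id)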

unload_lora_adapter async

unload_lora_adapter(
    request: UnloadLoRAAdapterRequest,
) -> ErrorResponse | str
Source code in vllm/entrypoints/openai/models/serving.py
async def unload_lora_adapter(
    self, request: UnloadLoRAAdapterRequest
) -> ErrorResponse | str:
    lora_name = request.lora_name

    # Ensure atomicity based on the lora name
    async with self.lora_resolver_lock[lora_name]:
        error_check_ret = await self._check_unload_lora_adapter_request(request)
        if error_check_ret is not None:
            return error_check_ret

        # Safe to delete now since we hold the lock
        del self.lora_requests[lora_name]
        logger.info("Removed LoRA adapter: name '%s'", lora_name)
        return f"Success: LoRA adapter '{lora_name}' removed successfully."

OpenAIServingTranscription

Bases: OpenAISpeechToText

Handles transcription requests.

Source code in vllm/entrypoints/openai/speech_to_text/serving.py
class OpenAIServingTranscription(OpenAISpeechToText):
    """Handles transcription requests."""

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        log_error_stack: bool = False,
        enable_force_include_usage: bool = False,
    ):
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            task_type="transcribe",
            log_error_stack=log_error_stack,
            enable_force_include_usage=enable_force_include_usage,
        )

    async def create_transcription(
        self,
        audio_data: bytes,
        request: TranscriptionRequest,
        raw_request: Request | None = None,
    ) -> (
        TranscriptionResponse
        | TranscriptionResponseVerbose
        | AsyncGenerator[str, None]
        | ErrorResponse
    ):
        """Transcription API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/audio/createTranscription
        for the API specification. This API mimics the OpenAI transcription API.
        """
        return await self._create_speech_to_text(
            audio_data=audio_data,
            request=request,
            raw_request=raw_request,
            response_class=(
                TranscriptionResponseVerbose
                if request.response_format == "verbose_json"
                else TranscriptionResponse
            ),
            stream_generator_method=self.transcription_stream_generator,
        )

    async def transcription_stream_generator(
        self,
        request: TranscriptionRequest,
        result_generator: list[AsyncGenerator[RequestOutput, None]],
        request_id: str,
        request_metadata: RequestResponseMetadata,
        audio_duration_s: float,
    ) -> AsyncGenerator[str, None]:
        generator = self._speech_to_text_stream_generator(
            request=request,
            list_result_generator=result_generator,
            request_id=request_id,
            request_metadata=request_metadata,
            audio_duration_s=audio_duration_s,
            chunk_object_type="transcription.chunk",
            response_stream_choice_class=TranscriptionResponseStreamChoice,
            stream_response_class=TranscriptionStreamResponse,
        )
        async for chunk in generator:
            yield chunk

__init__

__init__(
    engine_client: EngineClient,
    models: OpenAIServingModels,
    *,
    request_logger: RequestLogger | None,
    return_tokens_as_token_ids: bool = False,
    log_error_stack: bool = False,
    enable_force_include_usage: bool = False,
)
Source code in vllm/entrypoints/openai/speech_to_text/serving.py
def __init__(
    self,
    engine_client: EngineClient,
    models: OpenAIServingModels,
    *,
    request_logger: RequestLogger | None,
    return_tokens_as_token_ids: bool = False,
    log_error_stack: bool = False,
    enable_force_include_usage: bool = False,
):
    super().__init__(
        engine_client=engine_client,
        models=models,
        request_logger=request_logger,
        return_tokens_as_token_ids=return_tokens_as_token_ids,
        task_type="transcribe",
        log_error_stack=log_error_stack,
        enable_force_include_usage=enable_force_include_usage,
    )

create_transcription async

create_transcription(
    audio_data: bytes,
    request: TranscriptionRequest,
    raw_request: Request | None = None,
) -> (
    TranscriptionResponse
    | TranscriptionResponseVerbose
    | AsyncGenerator[str, None]
    | ErrorResponse
)

Transcription API similar to OpenAI's API.

See https://platform.openai.com/docs/api-reference/audio/createTranscription for the API specification. This API mimics the OpenAI transcription API.

Source code in vllm/entrypoints/openai/speech_to_text/serving.py
async def create_transcription(
    self,
    audio_data: bytes,
    request: TranscriptionRequest,
    raw_request: Request | None = None,
) -> (
    TranscriptionResponse
    | TranscriptionResponseVerbose
    | AsyncGenerator[str, None]
    | ErrorResponse
):
    """Transcription API similar to OpenAI's API.

    See https://platform.openai.com/docs/api-reference/audio/createTranscription
    for the API specification. This API mimics the OpenAI transcription API.
    """
    return await self._create_speech_to_text(
        audio_data=audio_data,
        request=request,
        raw_request=raw_request,
        response_class=(
            TranscriptionResponseVerbose
            if request.response_format == "verbose_json"
            else TranscriptionResponse
        ),
        stream_generator_method=self.transcription_stream_generator,
    )

transcription_stream_generator async

transcription_stream_generator(
    request: TranscriptionRequest,
    result_generator: list[
        AsyncGenerator[RequestOutput, None]
    ],
    request_id: str,
    request_metadata: RequestResponseMetadata,
    audio_duration_s: float,
) -> AsyncGenerator[str, None]
Source code in vllm/entrypoints/openai/speech_to_text/serving.py
async def transcription_stream_generator(
    self,
    request: TranscriptionRequest,
    result_generator: list[AsyncGenerator[RequestOutput, None]],
    request_id: str,
    request_metadata: RequestResponseMetadata,
    audio_duration_s: float,
) -> AsyncGenerator[str, None]:
    generator = self._speech_to_text_stream_generator(
        request=request,
        list_result_generator=result_generator,
        request_id=request_id,
        request_metadata=request_metadata,
        audio_duration_s=audio_duration_s,
        chunk_object_type="transcription.chunk",
        response_stream_choice_class=TranscriptionResponseStreamChoice,
        stream_response_class=TranscriptionStreamResponse,
    )
    async for chunk in generator:
        yield chunk

OpenAIServingTranslation

Bases: OpenAISpeechToText

Handles translation requests.

Source code in vllm/entrypoints/openai/speech_to_text/serving.py
class OpenAIServingTranslation(OpenAISpeechToText):
    """Handles translation requests."""

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        log_error_stack: bool = False,
        enable_force_include_usage: bool = False,
    ):
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            task_type="translate",
            log_error_stack=log_error_stack,
            enable_force_include_usage=enable_force_include_usage,
        )

    async def create_translation(
        self,
        audio_data: bytes,
        request: TranslationRequest,
        raw_request: Request | None = None,
    ) -> (
        TranslationResponse
        | TranslationResponseVerbose
        | AsyncGenerator[str, None]
        | ErrorResponse
    ):
        """Translation API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/audio/createTranslation
        for the API specification. This API mimics the OpenAI translation API.
        """
        return await self._create_speech_to_text(
            audio_data=audio_data,
            request=request,
            raw_request=raw_request,
            response_class=(
                TranslationResponseVerbose
                if request.response_format == "verbose_json"
                else TranslationResponse
            ),
            stream_generator_method=self.translation_stream_generator,
        )

    async def translation_stream_generator(
        self,
        request: TranslationRequest,
        result_generator: list[AsyncGenerator[RequestOutput, None]],
        request_id: str,
        request_metadata: RequestResponseMetadata,
        audio_duration_s: float,
    ) -> AsyncGenerator[str, None]:
        generator = self._speech_to_text_stream_generator(
            request=request,
            list_result_generator=result_generator,
            request_id=request_id,
            request_metadata=request_metadata,
            audio_duration_s=audio_duration_s,
            chunk_object_type="translation.chunk",
            response_stream_choice_class=TranslationResponseStreamChoice,
            stream_response_class=TranslationStreamResponse,
        )
        async for chunk in generator:
            yield chunk

__init__

__init__(
    engine_client: EngineClient,
    models: OpenAIServingModels,
    *,
    request_logger: RequestLogger | None,
    return_tokens_as_token_ids: bool = False,
    log_error_stack: bool = False,
    enable_force_include_usage: bool = False,
)
Source code in vllm/entrypoints/openai/speech_to_text/serving.py
def __init__(
    self,
    engine_client: EngineClient,
    models: OpenAIServingModels,
    *,
    request_logger: RequestLogger | None,
    return_tokens_as_token_ids: bool = False,
    log_error_stack: bool = False,
    enable_force_include_usage: bool = False,
):
    super().__init__(
        engine_client=engine_client,
        models=models,
        request_logger=request_logger,
        return_tokens_as_token_ids=return_tokens_as_token_ids,
        task_type="translate",
        log_error_stack=log_error_stack,
        enable_force_include_usage=enable_force_include_usage,
    )

create_translation async

create_translation(
    audio_data: bytes,
    request: TranslationRequest,
    raw_request: Request | None = None,
) -> (
    TranslationResponse
    | TranslationResponseVerbose
    | AsyncGenerator[str, None]
    | ErrorResponse
)

Translation API similar to OpenAI's API.

See https://platform.openai.com/docs/api-reference/audio/createTranslation for the API specification. This API mimics the OpenAI translation API.

Source code in vllm/entrypoints/openai/speech_to_text/serving.py
async def create_translation(
    self,
    audio_data: bytes,
    request: TranslationRequest,
    raw_request: Request | None = None,
) -> (
    TranslationResponse
    | TranslationResponseVerbose
    | AsyncGenerator[str, None]
    | ErrorResponse
):
    """Translation API similar to OpenAI's API.

    See https://platform.openai.com/docs/api-reference/audio/createTranslation
    for the API specification. This API mimics the OpenAI translation API.
    """
    return await self._create_speech_to_text(
        audio_data=audio_data,
        request=request,
        raw_request=raw_request,
        response_class=(
            TranslationResponseVerbose
            if request.response_format == "verbose_json"
            else TranslationResponse
        ),
        stream_generator_method=self.translation_stream_generator,
    )
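
Analogously, the translation endpoint can be exercised with the OpenAI Python client (placeholders as above); the streaming path is handled by translation_stream_generator below, so only the non-streaming call is shown here.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("speech_de.wav", "rb") as audio_file:
    translation = client.audio.translations.create(
        model="openai/whisper-large-v3",  # placeholder model name
        file=audio_file,
    )
print(translation.text)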

translation_stream_generator async

translation_stream_generator(
    request: TranslationRequest,
    result_generator: list[
        AsyncGenerator[RequestOutput, None]
    ],
    request_id: str,
    request_metadata: RequestResponseMetadata,
    audio_duration_s: float,
) -> AsyncGenerator[str, None]
Source code in vllm/entrypoints/openai/speech_to_text/serving.py
async def translation_stream_generator(
    self,
    request: TranslationRequest,
    result_generator: list[AsyncGenerator[RequestOutput, None]],
    request_id: str,
    request_metadata: RequestResponseMetadata,
    audio_duration_s: float,
) -> AsyncGenerator[str, None]:
    generator = self._speech_to_text_stream_generator(
        request=request,
        list_result_generator=result_generator,
        request_id=request_id,
        request_metadata=request_metadata,
        audio_duration_s=audio_duration_s,
        chunk_object_type="translation.chunk",
        response_stream_choice_class=TranslationResponseStreamChoice,
        stream_response_class=TranslationStreamResponse,
    )
    async for chunk in generator:
        yield chunk

OpenAISpeechToText

Bases: OpenAIServing

Base class for speech-to-text operations like transcription and translation.

Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
class OpenAISpeechToText(OpenAIServing):
    """Base class for speech-to-text operations like transcription and
    translation."""

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        task_type: Literal["transcribe", "translate"] = "transcribe",
        log_error_stack: bool = False,
        enable_force_include_usage: bool = False,
    ):
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            log_error_stack=log_error_stack,
        )

        self.default_sampling_params = self.model_config.get_diff_sampling_param()
        self.task_type: Final = task_type

        self.asr_config = self.model_cls.get_speech_to_text_config(
            self.model_config, task_type
        )

        self.enable_force_include_usage = enable_force_include_usage

        self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
        if self.model_cls.supports_segment_timestamp:
            self.tokenizer = cast(
                PreTrainedTokenizerBase,
                get_tokenizer(
                    tokenizer_name=self.model_config.tokenizer,
                    tokenizer_mode=self.model_config.tokenizer_mode,
                ),
            )

        if self.default_sampling_params:
            logger.info(
                "Overwriting default completion sampling param with: %s",
                self.default_sampling_params,
            )

        # Warm up audio preprocessing to avoid first-request latency
        self._warmup_audio_preprocessing()
        # Warm up input processor with dummy audio
        self._warmup_input_processor()

    def _warmup_audio_preprocessing(self) -> None:
        """Warm up audio processing libraries to avoid first-request latency.

        The first call to librosa functions (load, get_duration, mel-spectrogram)
        triggers JIT compilation and library initialization which can take ~7s.
        This method warms up these operations during server initialization.
        """
        # Skip warmup if librosa is not installed (optional dependency)
        if isinstance(librosa, PlaceholderModule):
            return

        # Skip warmup if model doesn't support transcription
        if not supports_transcription(self.model_cls):
            return

        if getattr(self.model_cls, "skip_warmup_audio_preprocessing", False):
            return

        try:
            warmup_start = time.perf_counter()
            logger.info("Warming up audio preprocessing libraries...")

            # Create a minimal dummy audio (1 second of silence at target sample rate)
            dummy_audio = np.zeros(int(self.asr_config.sample_rate), dtype=np.float32)

            # Warm up librosa.load by using librosa functions on the dummy data
            # This initializes FFTW, numba JIT, and other audio processing libraries
            _ = librosa.get_duration(y=dummy_audio, sr=self.asr_config.sample_rate)

            # Warm up mel-spectrogram computation with model-specific parameters
            from vllm.transformers_utils.processor import cached_processor_from_config

            processor = cached_processor_from_config(self.model_config)
            feature_extractor = None
            if hasattr(processor, "feature_extractor"):
                feature_extractor = processor.feature_extractor
            elif hasattr(processor, "audio_processor"):
                # For models like GraniteSpeech that use audio_processor
                audio_proc = processor.audio_processor
                if hasattr(audio_proc, "feature_extractor"):
                    feature_extractor = audio_proc.feature_extractor
                # If audio_processor doesn't have feature_extractor,
                # skip mel-spectrogram warmup for these models

            if feature_extractor is not None:
                _ = librosa.feature.melspectrogram(
                    y=dummy_audio,
                    sr=self.asr_config.sample_rate,
                    n_mels=getattr(feature_extractor, "n_mels", 128),
                    n_fft=getattr(feature_extractor, "n_fft", 400),
                    hop_length=getattr(feature_extractor, "hop_length", 160),
                )

            warmup_elapsed = time.perf_counter() - warmup_start
            logger.info("Audio preprocessing warmup completed in %.2fs", warmup_elapsed)
        except Exception:
            # Don't fail initialization if warmup fails - log exception and continue
            logger.exception(
                "Audio preprocessing warmup failed (non-fatal). "
                "First request may experience higher latency."
            )

    def _warmup_input_processor(self) -> None:
        """Warm up input processor with dummy audio to avoid first-request latency.

        The first call to input_processor.process_inputs() with multimodal audio
        triggers multimodal processing initialization which can take ~2.5s.
        This method processes a dummy audio request to warm up the pipeline.
        """
        # Skip warmup if model doesn't support transcription
        if not supports_transcription(self.model_cls):
            return

        # Only warm up if model supports transcription methods
        if not hasattr(self.model_cls, "get_generation_prompt"):
            return

        try:
            from vllm.sampling_params import SamplingParams

            warmup_start = time.perf_counter()
            logger.info("Warming up multimodal input processor...")

            # Create minimal dummy audio (1 second of silence)
            dummy_audio = np.zeros(int(self.asr_config.sample_rate), dtype=np.float32)

            # Use the same method that _preprocess_speech_to_text uses
            # to create the prompt
            dummy_prompt = self.model_cls.get_generation_prompt(
                audio=dummy_audio,
                stt_config=self.asr_config,
                model_config=self.model_config,
                language="en",
                task_type=self.task_type,
                request_prompt="",
                to_language=None,
            )

            # Create minimal sampling params
            dummy_params = SamplingParams(
                max_tokens=1,
                temperature=0.0,
                skip_clone=True,  # Internal warmup, safe to skip clone
            )

            # Process the dummy input through the input processor
            # This will trigger all the multimodal processing initialization
            _ = self.input_processor.process_inputs(
                request_id="warmup",
                prompt=dummy_prompt,
                params=dummy_params,
            )

            warmup_elapsed = time.perf_counter() - warmup_start
            logger.info("Input processor warmup completed in %.2fs", warmup_elapsed)
        except Exception:
            # Don't fail initialization if warmup fails - log exception and continue
            logger.exception(
                "Input processor warmup failed (non-fatal). "
                "First request may experience higher latency."
            )

    @cached_property
    def model_cls(self) -> type[SupportsTranscription]:
        from vllm.model_executor.model_loader import get_model_cls

        model_cls = get_model_cls(self.model_config)
        return cast(type[SupportsTranscription], model_cls)

    async def _preprocess_speech_to_text(
        self,
        request: SpeechToTextRequest,
        audio_data: bytes,
    ) -> tuple[list[PromptType], float]:
        # Validate request
        language = self.model_cls.validate_language(request.language)
        # Skip to_language validation to avoid extra logging for Whisper.
        to_language = (
            self.model_cls.validate_language(request.to_language)
            if request.to_language
            else None
        )

        if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
            raise VLLMValidationError(
                "Maximum file size exceeded",
                parameter="audio_filesize_mb",
                value=len(audio_data) / 1024**2,
            )

        with io.BytesIO(audio_data) as bytes_:
            # NOTE resample to model SR here for efficiency. This is also a
            # pre-requisite for chunking, as it assumes Whisper SR.
            y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)

        duration = librosa.get_duration(y=y, sr=sr)
        do_split_audio = (
            self.asr_config.allow_audio_chunking
            and duration > self.asr_config.max_audio_clip_s
        )
        chunks = [y] if not do_split_audio else self._split_audio(y, int(sr))
        prompts = []
        for chunk in chunks:
            # The model has control over the construction, as long as it
            # returns a valid PromptType.
            prompt = self.model_cls.get_generation_prompt(
                audio=chunk,
                stt_config=self.asr_config,
                model_config=self.model_config,
                language=language,
                task_type=self.task_type,
                request_prompt=request.prompt,
                to_language=to_language,
            )
            if request.response_format == "verbose_json":
                prompt = self._preprocess_verbose_prompt(parse_enc_dec_prompt(prompt))

            prompts.append(prompt)

        return prompts, duration

    def _preprocess_verbose_prompt(self, prompt: EncoderDecoderDictPrompt):
        dec_prompt = prompt["decoder_prompt"]

        if not (isinstance(dec_prompt, dict) and "prompt" in dec_prompt):
            raise VLLMValidationError(
                "Expected decoder_prompt to contain text",
                parameter="decoder_prompt",
                value=type(dec_prompt).__name__,
            )

        dec_prompt["prompt"] = dec_prompt["prompt"].replace(
            "<|notimestamps|>", "<|0.00|>"
        )

        return prompt

    def _get_verbose_segments(
        self,
        tokens: tuple,
        log_probs: FlatLogprobs | list[dict[int, Logprob]],
        request: SpeechToTextRequest,
        segment_class: type[SpeechToTextSegment],
        start_time: float = 0,
    ) -> list[SpeechToTextSegment]:
        """
        Convert tokens to verbose segments.

        This method expects the model to produce
        timestamps as tokens (similar to Whisper).
        If the tokens do not include timestamp information,
        the segments may not be generated correctly.

        Note: No_speech_prob field is not supported
        in this implementation and will be None. See docs for details.
        """
        BASE_OFFSET = 0.02
        init_token = self.tokenizer.encode("<|0.00|>", add_special_tokens=False)[0]
        if tokens[-1] == self.tokenizer.eos_token_id:
            tokens = tokens[:-1]

        tokens_with_start = (init_token,) + tokens
        segments: list[SpeechToTextSegment] = []
        last_timestamp_start = 0

        if tokens_with_start[-2] < init_token and tokens_with_start[-1] >= init_token:
            tokens_with_start = tokens_with_start + (tokens_with_start[-1],)
        avg_logprob = 0.0
        for idx in range(1, len(tokens_with_start)):
            # Timestamp tokens (e.g., <|0.00|>) are assumed to be sorted.
            # If the ordering is violated, this slicing may produce incorrect results.
            token = tokens_with_start[idx]
            if token >= init_token and tokens_with_start[idx - 1] >= init_token:
                sliced_timestamp_tokens = tokens_with_start[last_timestamp_start:idx]
                start_timestamp = sliced_timestamp_tokens[0] - init_token
                end_timestamp = sliced_timestamp_tokens[-1] - init_token
                text = self.tokenizer.decode(sliced_timestamp_tokens[1:-1])
                text_bytes = text.encode("utf-8")

                casting_segment = cast(
                    SpeechToTextSegment,
                    segment_class(
                        id=len(segments),
                        seek=start_time,
                        start=start_time + BASE_OFFSET * start_timestamp,
                        end=start_time + BASE_OFFSET * end_timestamp,
                        temperature=request.temperature,
                        text=text,
                        # The compression ratio measures
                        # how compressible the generated text is.
                        # A higher ratio indicates more repetitive content,
                        # which is a strong sign of hallucination in outputs.
                        compression_ratio=len(text_bytes)
                        / len(zlib.compress(text_bytes)),
                        tokens=sliced_timestamp_tokens[1:-1],
                        avg_logprob=avg_logprob / (idx - last_timestamp_start),
                    ),
                )
                segments.append(casting_segment)
                last_timestamp_start = idx
                avg_logprob = 0
            else:
                avg_logprob += log_probs[idx - 1][token].logprob
        return segments

    async def _create_speech_to_text(
        self,
        audio_data: bytes,
        request: SpeechToTextRequest,
        raw_request: Request,
        response_class: type[ResponseType],
        stream_generator_method: Callable[..., AsyncGenerator[str, None]],
    ) -> T | V | AsyncGenerator[str, None] | ErrorResponse:
        """Base method for speech-to-text operations like transcription and
        translation."""
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        # If the engine is dead, raise the engine's DEAD_ERROR.
        # This is required for the streaming case, where we return a
        # success status before we actually start generating text :).
        if self.engine_client.errored:
            raise self.engine_client.dead_error

        if request.response_format not in ["text", "json", "verbose_json"]:
            return self.create_error_response(
                "Currently only support response_format: "
                "`text`, `json` or `verbose_json`"
            )

        if (
            request.response_format == "verbose_json"
            and not self.model_cls.supports_segment_timestamp
        ):
            return self.create_error_response(
                f"Currently do not support verbose_json for {request.model}"
            )

        if request.response_format == "verbose_json" and request.stream:
            return self.create_error_response(
                "verbose_json format doesn't support streaming case"
            )
        request_id = f"{self.task_type}-{self._base_request_id(raw_request)}"

        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        try:
            lora_request = self._maybe_get_adapters(request)

            prompts, duration_s = await self._preprocess_speech_to_text(
                request=request,
                audio_data=audio_data,
            )

        except ValueError as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(e)

        list_result_generator: list[AsyncGenerator[RequestOutput, None]] | None = None
        try:
            # Unlike most decoder-only models, whisper generation length is not
            # constrained by the size of the input audio, which is mapped to a
            # fixed-size log-mel-spectrogram. Still, allow for fewer tokens to be
            # generated by respecting the extra completion tokens arg.
            if request.max_completion_tokens is None:
                default_max_tokens = self.model_config.max_model_len
            else:
                default_max_tokens = min(
                    self.model_config.max_model_len, request.max_completion_tokens
                )
            sampling_params = request.to_sampling_params(
                default_max_tokens, self.default_sampling_params
            )
            if request.response_format == "verbose_json":
                sampling_params.logprobs = 1

            self._log_inputs(
                request_id,
                # It will not display special tokens like <|startoftranscript|>
                request.prompt,
                params=sampling_params,
                lora_request=lora_request,
            )

            list_result_generator = [
                self.engine_client.generate(
                    prompt,
                    sampling_params,
                    f"{request_id}_{i}",
                    lora_request=lora_request,
                )
                for i, prompt in enumerate(prompts)
            ]
        except ValueError as e:
            return self.create_error_response(e)

        if request.stream:
            return stream_generator_method(
                request, list_result_generator, request_id, request_metadata, duration_s
            )
        # Non-streaming response.
        total_segments = []
        text_parts = []
        try:
            assert list_result_generator is not None
            segments_types: dict[str, type[SpeechToTextSegment]] = {
                "transcribe": TranscriptionSegment,
                "translate": TranslationSegment,
            }
            segment_class: type[SpeechToTextSegment] = segments_types[self.task_type]
            text = ""
            chunk_size_in_s = self.asr_config.max_audio_clip_s
            if chunk_size_in_s is None:
                assert len(list_result_generator) == 1, (
                    "`max_audio_clip_s` is set to None, audio cannot be chunked"
                )
            for idx, result_generator in enumerate(list_result_generator):
                start_time = (
                    float(idx * chunk_size_in_s) if chunk_size_in_s is not None else 0.0
                )
                async for op in result_generator:
                    if request.response_format == "verbose_json":
                        assert op.outputs[0].logprobs
                        segments: list[SpeechToTextSegment] = (
                            self._get_verbose_segments(
                                tokens=tuple(op.outputs[0].token_ids),
                                segment_class=segment_class,
                                request=request,
                                start_time=start_time,
                                log_probs=op.outputs[0].logprobs,
                            )
                        )

                        total_segments.extend(segments)
                        text_parts.extend([seg.text for seg in segments])
                    else:
                        raw_text = op.outputs[0].text
                        text_parts.append(self.model_cls.post_process_output(raw_text))
            text = "".join(text_parts)
            if self.task_type == "transcribe":
                final_response: ResponseType
                # add usage in TranscriptionResponse.
                usage = {
                    "type": "duration",
                    # rounded up as per OpenAI specs
                    "seconds": int(math.ceil(duration_s)),
                }
                if request.response_format != "verbose_json":
                    final_response = cast(
                        T, TranscriptionResponse(text=text, usage=usage)
                    )
                else:
                    final_response = cast(
                        V,
                        TranscriptionResponseVerbose(
                            text=text,
                            language=request.language,
                            duration=str(duration_s),
                            segments=total_segments,
                        ),
                    )
            else:
                # no usage in response for translation task
                if request.response_format != "verbose_json":
                    final_response = cast(T, TranslationResponse(text=text))
                else:
                    final_response = cast(
                        V,
                        TranslationResponseVerbose(
                            text=text,
                            language=request.language,
                            duration=str(duration_s),
                            segments=total_segments,
                        ),
                    )
            return final_response
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except ValueError as e:
            return self.create_error_response(e)

    async def _speech_to_text_stream_generator(
        self,
        request: SpeechToTextRequest,
        list_result_generator: list[AsyncGenerator[RequestOutput, None]],
        request_id: str,
        request_metadata: RequestResponseMetadata,
        audio_duration_s: float,
        chunk_object_type: Literal["translation.chunk", "transcription.chunk"],
        response_stream_choice_class: type[TranscriptionResponseStreamChoice]
        | type[TranslationResponseStreamChoice],
        stream_response_class: type[TranscriptionStreamResponse]
        | type[TranslationStreamResponse],
    ) -> AsyncGenerator[str, None]:
        created_time = int(time.time())
        model_name = request.model

        completion_tokens = 0
        num_prompt_tokens = 0

        include_usage = self.enable_force_include_usage or request.stream_include_usage
        include_continuous_usage = (
            request.stream_continuous_usage_stats
            if include_usage and request.stream_continuous_usage_stats
            else False
        )

        try:
            for result_generator in list_result_generator:
                async for res in result_generator:
                    # On first result.
                    if res.prompt_token_ids is not None:
                        num_prompt_tokens = len(res.prompt_token_ids)
                        if audio_tokens := self.model_cls.get_num_audio_tokens(
                            audio_duration_s, self.asr_config, self.model_config
                        ):
                            num_prompt_tokens += audio_tokens

                    # We need to do it here, because if there are exceptions in
                    # the result_generator, it needs to be sent as the FIRST
                    # response (by the try...catch).

                    # Just one output (n=1) supported.
                    assert len(res.outputs) == 1
                    output = res.outputs[0]

                    # TODO: For models that output structured formats (e.g.,
                    # Qwen3-ASR with "language X<asr_text>" prefix), streaming
                    # would need buffering to strip the prefix properly since
                    # deltas may split the tag across chunks.
                    delta_message = DeltaMessage(content=output.text)
                    completion_tokens += len(output.token_ids)

                    if output.finish_reason is None:
                        # Still generating, send delta update.
                        choice_data = response_stream_choice_class(delta=delta_message)
                    else:
                        # Model is finished generating.
                        choice_data = response_stream_choice_class(
                            delta=delta_message,
                            finish_reason=output.finish_reason,
                            stop_reason=output.stop_reason,
                        )

                    chunk = stream_response_class(
                        id=request_id,
                        object=chunk_object_type,
                        created=created_time,
                        choices=[choice_data],
                        model=model_name,
                    )

                    # handle usage stats if requested & if continuous
                    if include_continuous_usage:
                        chunk.usage = UsageInfo(
                            prompt_tokens=num_prompt_tokens,
                            completion_tokens=completion_tokens,
                            total_tokens=num_prompt_tokens + completion_tokens,
                        )

                    data = chunk.model_dump_json(exclude_unset=True)
                    yield f"data: {data}\n\n"

            # Once the final token is handled, if stream_options.include_usage
            # is sent, send the usage.
            if include_usage:
                final_usage = UsageInfo(
                    prompt_tokens=num_prompt_tokens,
                    completion_tokens=completion_tokens,
                    total_tokens=num_prompt_tokens + completion_tokens,
                )

                final_usage_chunk = stream_response_class(
                    id=request_id,
                    object=chunk_object_type,
                    created=created_time,
                    choices=[],
                    model=model_name,
                    usage=final_usage,
                )
                final_usage_data = final_usage_chunk.model_dump_json(
                    exclude_unset=True, exclude_none=True
                )
                yield f"data: {final_usage_data}\n\n"

            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = UsageInfo(
                prompt_tokens=num_prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=num_prompt_tokens + completion_tokens,
            )

        except Exception as e:
            logger.exception("Error in %s stream generator.", self.task_type)
            data = self.create_streaming_error_response(e)
            yield f"data: {data}\n\n"
        # Send the final done message after all response.n are finished
        yield "data: [DONE]\n\n"

    def _split_audio(
        self, audio_data: np.ndarray, sample_rate: int
    ) -> list[np.ndarray]:
        assert self.asr_config.max_audio_clip_s is not None, (
            f"{self.asr_config.max_audio_clip_s=} cannot be None to"
            " split audio into chunks."
        )
        chunk_size = sample_rate * self.asr_config.max_audio_clip_s
        overlap_size = sample_rate * self.asr_config.overlap_chunk_second
        chunks = []
        i = 0
        while i < audio_data.shape[-1]:
            if i + chunk_size >= audio_data.shape[-1]:
                # handle last chunk
                chunks.append(audio_data[..., i:])
                break

            # Find the best split point in the overlap region
            search_start = i + chunk_size - overlap_size
            search_end = min(i + chunk_size, audio_data.shape[-1])
            split_point = self._find_split_point(audio_data, search_start, search_end)

            # Extract chunk up to the split point
            chunks.append(audio_data[..., i:split_point])
            i = split_point
        return chunks

    def _find_split_point(self, wav: np.ndarray, start_idx: int, end_idx: int) -> int:
        """Find the best point to split audio by
        looking for silence or low amplitude.
        Args:
            wav: Audio tensor [1, T]
            start_idx: Start index of search region
            end_idx: End index of search region
        Returns:
            Index of best splitting point
        """
        segment = wav[start_idx:end_idx]

        # Calculate RMS energy in small windows
        min_energy = math.inf
        quietest_idx = 0
        min_energy_window = self.asr_config.min_energy_split_window_size
        assert min_energy_window is not None
        for i in range(0, len(segment) - min_energy_window, min_energy_window):
            window = segment[i : i + min_energy_window]
            energy = (window**2).mean() ** 0.5
            if energy < min_energy:
                quietest_idx = i + start_idx
                min_energy = energy
        return quietest_idx
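
The transcription and translation endpoints built on this base class follow the OpenAI audio API, so a standard OpenAI client can exercise the flow above end to end. A minimal client-side sketch, assuming a vLLM server is already running at localhost:8000 with an ASR-capable model (the model name, port, and file name are placeholders):

# Client-side sketch only: server URL and model name are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("sample.wav", "rb") as audio_file:
    # Exercises the "transcribe" task_type handled by this class.
    result = client.audio.transcriptions.create(
        model="openai/whisper-large-v3",
        file=audio_file,
        language="en",
        response_format="json",
    )

print(result.text)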

asr_config instance-attribute

asr_config = get_speech_to_text_config(
    model_config, task_type
)

default_sampling_params instance-attribute

default_sampling_params = get_diff_sampling_param()

enable_force_include_usage instance-attribute

enable_force_include_usage = enable_force_include_usage

max_audio_filesize_mb instance-attribute

max_audio_filesize_mb = VLLM_MAX_AUDIO_CLIP_FILESIZE_MB

model_cls cached property

task_type instance-attribute

task_type: Final = task_type

tokenizer instance-attribute

tokenizer = cast(
    PreTrainedTokenizerBase,
    get_tokenizer(
        tokenizer_name=tokenizer,
        tokenizer_mode=tokenizer_mode,
    ),
)

__init__

__init__(
    engine_client: EngineClient,
    models: OpenAIServingModels,
    *,
    request_logger: RequestLogger | None,
    return_tokens_as_token_ids: bool = False,
    task_type: Literal[
        "transcribe", "translate"
    ] = "transcribe",
    log_error_stack: bool = False,
    enable_force_include_usage: bool = False,
)
Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
def __init__(
    self,
    engine_client: EngineClient,
    models: OpenAIServingModels,
    *,
    request_logger: RequestLogger | None,
    return_tokens_as_token_ids: bool = False,
    task_type: Literal["transcribe", "translate"] = "transcribe",
    log_error_stack: bool = False,
    enable_force_include_usage: bool = False,
):
    super().__init__(
        engine_client=engine_client,
        models=models,
        request_logger=request_logger,
        return_tokens_as_token_ids=return_tokens_as_token_ids,
        log_error_stack=log_error_stack,
    )

    self.default_sampling_params = self.model_config.get_diff_sampling_param()
    self.task_type: Final = task_type

    self.asr_config = self.model_cls.get_speech_to_text_config(
        self.model_config, task_type
    )

    self.enable_force_include_usage = enable_force_include_usage

    self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
    if self.model_cls.supports_segment_timestamp:
        self.tokenizer = cast(
            PreTrainedTokenizerBase,
            get_tokenizer(
                tokenizer_name=self.model_config.tokenizer,
                tokenizer_mode=self.model_config.tokenizer_mode,
            ),
        )

    if self.default_sampling_params:
        logger.info(
            "Overwriting default completion sampling param with: %s",
            self.default_sampling_params,
        )

    # Warm up audio preprocessing to avoid first-request latency
    self._warmup_audio_preprocessing()
    # Warm up input processor with dummy audio
    self._warmup_input_processor()

_create_speech_to_text async

_create_speech_to_text(
    audio_data: bytes,
    request: SpeechToTextRequest,
    raw_request: Request,
    response_class: type[ResponseType],
    stream_generator_method: Callable[
        ..., AsyncGenerator[str, None]
    ],
) -> T | V | AsyncGenerator[str, None] | ErrorResponse

Base method for speech-to-text operations like transcription and translation.

Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
async def _create_speech_to_text(
    self,
    audio_data: bytes,
    request: SpeechToTextRequest,
    raw_request: Request,
    response_class: type[ResponseType],
    stream_generator_method: Callable[..., AsyncGenerator[str, None]],
) -> T | V | AsyncGenerator[str, None] | ErrorResponse:
    """Base method for speech-to-text operations like transcription and
    translation."""
    error_check_ret = await self._check_model(request)
    if error_check_ret is not None:
        return error_check_ret

    # If the engine is dead, raise the engine's DEAD_ERROR.
    # This is required for the streaming case, where we return a
    # success status before we actually start generating text :).
    if self.engine_client.errored:
        raise self.engine_client.dead_error

    if request.response_format not in ["text", "json", "verbose_json"]:
        return self.create_error_response(
            "Currently only support response_format: "
            "`text`, `json` or `verbose_json`"
        )

    if (
        request.response_format == "verbose_json"
        and not self.model_cls.supports_segment_timestamp
    ):
        return self.create_error_response(
            f"Currently do not support verbose_json for {request.model}"
        )

    if request.response_format == "verbose_json" and request.stream:
        return self.create_error_response(
            "verbose_json format doesn't support streaming case"
        )
    request_id = f"{self.task_type}-{self._base_request_id(raw_request)}"

    request_metadata = RequestResponseMetadata(request_id=request_id)
    if raw_request:
        raw_request.state.request_metadata = request_metadata

    try:
        lora_request = self._maybe_get_adapters(request)

        prompts, duration_s = await self._preprocess_speech_to_text(
            request=request,
            audio_data=audio_data,
        )

    except ValueError as e:
        logger.exception("Error in preprocessing prompt inputs")
        return self.create_error_response(e)

    list_result_generator: list[AsyncGenerator[RequestOutput, None]] | None = None
    try:
        # Unlike most decoder-only models, whisper generation length is not
        # constrained by the size of the input audio, which is mapped to a
        # fixed-size log-mel-spectrogram. Still, allow for fewer tokens to be
        # generated by respecting the extra completion tokens arg.
        if request.max_completion_tokens is None:
            default_max_tokens = self.model_config.max_model_len
        else:
            default_max_tokens = min(
                self.model_config.max_model_len, request.max_completion_tokens
            )
        sampling_params = request.to_sampling_params(
            default_max_tokens, self.default_sampling_params
        )
        if request.response_format == "verbose_json":
            sampling_params.logprobs = 1

        self._log_inputs(
            request_id,
            # It will not display special tokens like <|startoftranscript|>
            request.prompt,
            params=sampling_params,
            lora_request=lora_request,
        )

        list_result_generator = [
            self.engine_client.generate(
                prompt,
                sampling_params,
                f"{request_id}_{i}",
                lora_request=lora_request,
            )
            for i, prompt in enumerate(prompts)
        ]
    except ValueError as e:
        return self.create_error_response(e)

    if request.stream:
        return stream_generator_method(
            request, list_result_generator, request_id, request_metadata, duration_s
        )
    # Non-streaming response.
    total_segments = []
    text_parts = []
    try:
        assert list_result_generator is not None
        segments_types: dict[str, type[SpeechToTextSegment]] = {
            "transcribe": TranscriptionSegment,
            "translate": TranslationSegment,
        }
        segment_class: type[SpeechToTextSegment] = segments_types[self.task_type]
        text = ""
        chunk_size_in_s = self.asr_config.max_audio_clip_s
        if chunk_size_in_s is None:
            assert len(list_result_generator) == 1, (
                "`max_audio_clip_s` is set to None, audio cannot be chunked"
            )
        for idx, result_generator in enumerate(list_result_generator):
            start_time = (
                float(idx * chunk_size_in_s) if chunk_size_in_s is not None else 0.0
            )
            async for op in result_generator:
                if request.response_format == "verbose_json":
                    assert op.outputs[0].logprobs
                    segments: list[SpeechToTextSegment] = (
                        self._get_verbose_segments(
                            tokens=tuple(op.outputs[0].token_ids),
                            segment_class=segment_class,
                            request=request,
                            start_time=start_time,
                            log_probs=op.outputs[0].logprobs,
                        )
                    )

                    total_segments.extend(segments)
                    text_parts.extend([seg.text for seg in segments])
                else:
                    raw_text = op.outputs[0].text
                    text_parts.append(self.model_cls.post_process_output(raw_text))
        text = "".join(text_parts)
        if self.task_type == "transcribe":
            final_response: ResponseType
            # add usage in TranscriptionResponse.
            usage = {
                "type": "duration",
                # rounded up as per OpenAI specs
                "seconds": int(math.ceil(duration_s)),
            }
            if request.response_format != "verbose_json":
                final_response = cast(
                    T, TranscriptionResponse(text=text, usage=usage)
                )
            else:
                final_response = cast(
                    V,
                    TranscriptionResponseVerbose(
                        text=text,
                        language=request.language,
                        duration=str(duration_s),
                        segments=total_segments,
                    ),
                )
        else:
            # no usage in response for translation task
            if request.response_format != "verbose_json":
                final_response = cast(T, TranslationResponse(text=text))
            else:
                final_response = cast(
                    V,
                    TranslationResponseVerbose(
                        text=text,
                        language=request.language,
                        duration=str(duration_s),
                        segments=total_segments,
                    ),
                )
        return final_response
    except asyncio.CancelledError:
        return self.create_error_response("Client disconnected")
    except ValueError as e:
        return self.create_error_response(e)
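
For the verbose_json branch above (allowed only when the model reports supports_segment_timestamp and rejected together with stream), a raw multipart request shows the accepted form fields. A sketch using the requests library; the server URL and model name are assumptions:

# Sketch: request segment timestamps via response_format="verbose_json".
import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",  # assumed server URL
        files={"file": ("sample.wav", f, "audio/wav")},
        data={
            "model": "openai/whisper-large-v3",  # placeholder model name
            "response_format": "verbose_json",
            "language": "en",
            "temperature": "0",
        },
    )

payload = resp.json()
print(payload["text"])
for seg in payload.get("segments", []):
    print(seg["start"], seg["end"], seg["text"])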

_find_split_point

_find_split_point(
    wav: ndarray, start_idx: int, end_idx: int
) -> int

Find the best point to split audio by looking for silence or low amplitude.

Args:
    wav: Audio tensor [1, T]
    start_idx: Start index of search region
    end_idx: End index of search region

Returns:
    Index of best splitting point

Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
def _find_split_point(self, wav: np.ndarray, start_idx: int, end_idx: int) -> int:
    """Find the best point to split audio by
    looking for silence or low amplitude.
    Args:
        wav: Audio tensor [1, T]
        start_idx: Start index of search region
        end_idx: End index of search region
    Returns:
        Index of best splitting point
    """
    segment = wav[start_idx:end_idx]

    # Calculate RMS energy in small windows
    min_energy = math.inf
    quietest_idx = 0
    min_energy_window = self.asr_config.min_energy_split_window_size
    assert min_energy_window is not None
    for i in range(0, len(segment) - min_energy_window, min_energy_window):
        window = segment[i : i + min_energy_window]
        energy = (window**2).mean() ** 0.5
        if energy < min_energy:
            quietest_idx = i + start_idx
            min_energy = energy
    return quietest_idx
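
To see what the windowed-RMS scan above does, the following standalone sketch applies the same search to synthetic audio. It is illustrative only: the 1600-sample window is an assumption, not the configured min_energy_split_window_size.

# Standalone illustration of the windowed-RMS split-point search.
import numpy as np

def find_quietest_index(
    wav: np.ndarray, start_idx: int, end_idx: int, window: int = 1600
) -> int:
    segment = wav[start_idx:end_idx]
    min_energy, quietest_idx = np.inf, start_idx
    for i in range(0, len(segment) - window, window):
        energy = float(np.sqrt(np.mean(segment[i : i + window] ** 2)))
        if energy < min_energy:
            quietest_idx, min_energy = i + start_idx, energy
    return quietest_idx

# 2 s of noise with a quiet patch around 1.5 s; the search lands in that patch.
sr = 16_000
wav = np.random.randn(2 * sr).astype(np.float32)
wav[int(1.45 * sr) : int(1.55 * sr)] *= 0.01
print(find_quietest_index(wav, sr, 2 * sr))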

_get_verbose_segments

_get_verbose_segments(
    tokens: tuple,
    log_probs: FlatLogprobs | list[dict[int, Logprob]],
    request: SpeechToTextRequest,
    segment_class: type[SpeechToTextSegment],
    start_time: float = 0,
) -> list[SpeechToTextSegment]

Convert tokens to verbose segments.

This method expects the model to produce timestamps as tokens (similar to Whisper). If the tokens do not include timestamp information, the segments may not be generated correctly.

Note: No_speech_prob field is not supported in this implementation and will be None. See docs for details.

Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
def _get_verbose_segments(
    self,
    tokens: tuple,
    log_probs: FlatLogprobs | list[dict[int, Logprob]],
    request: SpeechToTextRequest,
    segment_class: type[SpeechToTextSegment],
    start_time: float = 0,
) -> list[SpeechToTextSegment]:
    """
    Convert tokens to verbose segments.

    This method expects the model to produce
    timestamps as tokens (similar to Whisper).
    If the tokens do not include timestamp information,
    the segments may not be generated correctly.

    Note: No_speech_prob field is not supported
    in this implementation and will be None. See docs for details.
    """
    BASE_OFFSET = 0.02
    init_token = self.tokenizer.encode("<|0.00|>", add_special_tokens=False)[0]
    if tokens[-1] == self.tokenizer.eos_token_id:
        tokens = tokens[:-1]

    tokens_with_start = (init_token,) + tokens
    segments: list[SpeechToTextSegment] = []
    last_timestamp_start = 0

    if tokens_with_start[-2] < init_token and tokens_with_start[-1] >= init_token:
        tokens_with_start = tokens_with_start + (tokens_with_start[-1],)
    avg_logprob = 0.0
    for idx in range(1, len(tokens_with_start)):
        # Timestamp tokens (e.g., <|0.00|>) are assumed to be sorted.
        # If the ordering is violated, this slicing may produce incorrect results.
        token = tokens_with_start[idx]
        if token >= init_token and tokens_with_start[idx - 1] >= init_token:
            sliced_timestamp_tokens = tokens_with_start[last_timestamp_start:idx]
            start_timestamp = sliced_timestamp_tokens[0] - init_token
            end_timestamp = sliced_timestamp_tokens[-1] - init_token
            text = self.tokenizer.decode(sliced_timestamp_tokens[1:-1])
            text_bytes = text.encode("utf-8")

            casting_segment = cast(
                SpeechToTextSegment,
                segment_class(
                    id=len(segments),
                    seek=start_time,
                    start=start_time + BASE_OFFSET * start_timestamp,
                    end=start_time + BASE_OFFSET * end_timestamp,
                    temperature=request.temperature,
                    text=text,
                    # The compression ratio measures
                    # how compressible the generated text is.
                    # A higher ratio indicates more repetitive content,
                    # which is a strong sign of hallucination in outputs.
                    compression_ratio=len(text_bytes)
                    / len(zlib.compress(text_bytes)),
                    tokens=sliced_timestamp_tokens[1:-1],
                    avg_logprob=avg_logprob / (idx - last_timestamp_start),
                ),
            )
            segments.append(casting_segment)
            last_timestamp_start = idx
            avg_logprob = 0
        else:
            avg_logprob += log_probs[idx - 1][token].logprob
    return segments
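
The segment boundaries above come directly from the timestamp tokens: each one encodes a multiple of BASE_OFFSET (0.02 s) relative to the id of "<|0.00|>", shifted by the chunk's start_time. A small worked sketch with hypothetical token ids (the real ids depend on the tokenizer):

# Worked example of the timestamp arithmetic; the ids are hypothetical.
BASE_OFFSET = 0.02
init_token = 50_365      # assumed id of "<|0.00|>"
start_time = 30.0        # offset of this audio chunk in seconds

timestamp_token = init_token + 125   # i.e. "<|2.50|>"
seconds = start_time + BASE_OFFSET * (timestamp_token - init_token)
print(seconds)  # 32.5 -> boundary 2.5 s into this chunk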

_preprocess_speech_to_text async

_preprocess_speech_to_text(
    request: SpeechToTextRequest, audio_data: bytes
) -> tuple[list[PromptType], float]
Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
async def _preprocess_speech_to_text(
    self,
    request: SpeechToTextRequest,
    audio_data: bytes,
) -> tuple[list[PromptType], float]:
    # Validate request
    language = self.model_cls.validate_language(request.language)
    # Skip to_language validation to avoid extra logging for Whisper.
    to_language = (
        self.model_cls.validate_language(request.to_language)
        if request.to_language
        else None
    )

    if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
        raise VLLMValidationError(
            "Maximum file size exceeded",
            parameter="audio_filesize_mb",
            value=len(audio_data) / 1024**2,
        )

    with io.BytesIO(audio_data) as bytes_:
        # NOTE resample to model SR here for efficiency. This is also a
        # pre-requisite for chunking, as it assumes Whisper SR.
        y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)

    duration = librosa.get_duration(y=y, sr=sr)
    do_split_audio = (
        self.asr_config.allow_audio_chunking
        and duration > self.asr_config.max_audio_clip_s
    )
    chunks = [y] if not do_split_audio else self._split_audio(y, int(sr))
    prompts = []
    for chunk in chunks:
        # The model has control over the construction, as long as it
        # returns a valid PromptType.
        prompt = self.model_cls.get_generation_prompt(
            audio=chunk,
            stt_config=self.asr_config,
            model_config=self.model_config,
            language=language,
            task_type=self.task_type,
            request_prompt=request.prompt,
            to_language=to_language,
        )
        if request.response_format == "verbose_json":
            prompt = self._preprocess_verbose_prompt(parse_enc_dec_prompt(prompt))

        prompts.append(prompt)

    return prompts, duration
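
Whether the decoded audio gets chunked depends only on allow_audio_chunking and max_audio_clip_s, alongside the separate megabyte limit on the raw upload. A quick sketch of both checks with illustrative values (30 s clips and a 25 MB limit are assumptions, not guaranteed defaults):

# Illustrative pre-processing checks; the config values are assumptions.
audio_filesize_mb = 12.4
max_audio_filesize_mb = 25
assert audio_filesize_mb <= max_audio_filesize_mb, "Maximum file size exceeded"

allow_audio_chunking = True
max_audio_clip_s = 30
duration = 95.0  # seconds of decoded audio

do_split_audio = allow_audio_chunking and duration > max_audio_clip_s
print(do_split_audio)  # True -> audio is handed to _split_audio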

_preprocess_verbose_prompt

_preprocess_verbose_prompt(
    prompt: EncoderDecoderDictPrompt,
)
Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
def _preprocess_verbose_prompt(self, prompt: EncoderDecoderDictPrompt):
    dec_prompt = prompt["decoder_prompt"]

    if not (isinstance(dec_prompt, dict) and "prompt" in dec_prompt):
        raise VLLMValidationError(
            "Expected decoder_prompt to contain text",
            parameter="decoder_prompt",
            value=type(dec_prompt).__name__,
        )

    dec_prompt["prompt"] = dec_prompt["prompt"].replace(
        "<|notimestamps|>", "<|0.00|>"
    )

    return prompt
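
The effect of the substitution above, shown on a simplified, hypothetical encoder/decoder prompt dict (the Whisper-style decoder prompt string is illustrative):

# Hypothetical prompt dict; only the decoder_prompt substitution matters here.
prompt = {
    "encoder_prompt": {"prompt": "", "multi_modal_data": {"audio": ...}},
    "decoder_prompt": {
        "prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
    },
}

dec_prompt = prompt["decoder_prompt"]
dec_prompt["prompt"] = dec_prompt["prompt"].replace("<|notimestamps|>", "<|0.00|>")
print(dec_prompt["prompt"])
# <|startoftranscript|><|en|><|transcribe|><|0.00|>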

_speech_to_text_stream_generator async

_speech_to_text_stream_generator(
    request: SpeechToTextRequest,
    list_result_generator: list[
        AsyncGenerator[RequestOutput, None]
    ],
    request_id: str,
    request_metadata: RequestResponseMetadata,
    audio_duration_s: float,
    chunk_object_type: Literal[
        "translation.chunk", "transcription.chunk"
    ],
    response_stream_choice_class: type[
        TranscriptionResponseStreamChoice
    ]
    | type[TranslationResponseStreamChoice],
    stream_response_class: type[TranscriptionStreamResponse]
    | type[TranslationStreamResponse],
) -> AsyncGenerator[str, None]
Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
async def _speech_to_text_stream_generator(
    self,
    request: SpeechToTextRequest,
    list_result_generator: list[AsyncGenerator[RequestOutput, None]],
    request_id: str,
    request_metadata: RequestResponseMetadata,
    audio_duration_s: float,
    chunk_object_type: Literal["translation.chunk", "transcription.chunk"],
    response_stream_choice_class: type[TranscriptionResponseStreamChoice]
    | type[TranslationResponseStreamChoice],
    stream_response_class: type[TranscriptionStreamResponse]
    | type[TranslationStreamResponse],
) -> AsyncGenerator[str, None]:
    created_time = int(time.time())
    model_name = request.model

    completion_tokens = 0
    num_prompt_tokens = 0

    include_usage = self.enable_force_include_usage or request.stream_include_usage
    include_continuous_usage = (
        request.stream_continuous_usage_stats
        if include_usage and request.stream_continuous_usage_stats
        else False
    )

    try:
        for result_generator in list_result_generator:
            async for res in result_generator:
                # On first result.
                if res.prompt_token_ids is not None:
                    num_prompt_tokens = len(res.prompt_token_ids)
                    if audio_tokens := self.model_cls.get_num_audio_tokens(
                        audio_duration_s, self.asr_config, self.model_config
                    ):
                        num_prompt_tokens += audio_tokens

                # We need to do it here, because if there are exceptions in
                # the result_generator, it needs to be sent as the FIRST
                # response (by the try...catch).

                # Just one output (n=1) supported.
                assert len(res.outputs) == 1
                output = res.outputs[0]

                # TODO: For models that output structured formats (e.g.,
                # Qwen3-ASR with "language X<asr_text>" prefix), streaming
                # would need buffering to strip the prefix properly since
                # deltas may split the tag across chunks.
                delta_message = DeltaMessage(content=output.text)
                completion_tokens += len(output.token_ids)

                if output.finish_reason is None:
                    # Still generating, send delta update.
                    choice_data = response_stream_choice_class(delta=delta_message)
                else:
                    # Model is finished generating.
                    choice_data = response_stream_choice_class(
                        delta=delta_message,
                        finish_reason=output.finish_reason,
                        stop_reason=output.stop_reason,
                    )

                chunk = stream_response_class(
                    id=request_id,
                    object=chunk_object_type,
                    created=created_time,
                    choices=[choice_data],
                    model=model_name,
                )

                # handle usage stats if requested & if continuous
                if include_continuous_usage:
                    chunk.usage = UsageInfo(
                        prompt_tokens=num_prompt_tokens,
                        completion_tokens=completion_tokens,
                        total_tokens=num_prompt_tokens + completion_tokens,
                    )

                data = chunk.model_dump_json(exclude_unset=True)
                yield f"data: {data}\n\n"

        # Once the final token is handled, if stream_options.include_usage
        # is sent, send the usage.
        if include_usage:
            final_usage = UsageInfo(
                prompt_tokens=num_prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=num_prompt_tokens + completion_tokens,
            )

            final_usage_chunk = stream_response_class(
                id=request_id,
                object=chunk_object_type,
                created=created_time,
                choices=[],
                model=model_name,
                usage=final_usage,
            )
            final_usage_data = final_usage_chunk.model_dump_json(
                exclude_unset=True, exclude_none=True
            )
            yield f"data: {final_usage_data}\n\n"

        # report to FastAPI middleware aggregate usage across all choices
        request_metadata.final_usage_info = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=num_prompt_tokens + completion_tokens,
        )

    except Exception as e:
        logger.exception("Error in %s stream generator.", self.task_type)
        data = self.create_streaming_error_response(e)
        yield f"data: {data}\n\n"
    # Send the final done message after all response.n are finished
    yield "data: [DONE]\n\n"
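
The generator above emits standard server-sent events ("data: {json}\n\n" chunks, terminated by "data: [DONE]"). A hedged client-side sketch that consumes such a stream with requests; the URL and model name are placeholders:

# Sketch: stream a transcription and print text deltas as they arrive.
import json

import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",  # assumed server URL
        files={"file": ("sample.wav", f, "audio/wav")},
        data={"model": "openai/whisper-large-v3", "stream": "true"},
        stream=True,
    )

for line in resp.iter_lines(decode_unicode=True):
    if not line or not line.startswith("data: "):
        continue
    payload = line[len("data: "):]
    if payload == "[DONE]":
        break
    chunk = json.loads(payload)
    for choice in chunk.get("choices", []):
        print(choice["delta"].get("content", ""), end="", flush=True)
print()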

_split_audio

_split_audio(
    audio_data: ndarray, sample_rate: int
) -> list[ndarray]
Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
def _split_audio(
    self, audio_data: np.ndarray, sample_rate: int
) -> list[np.ndarray]:
    assert self.asr_config.max_audio_clip_s is not None, (
        f"{self.asr_config.max_audio_clip_s=} cannot be None to"
        " split audio into chunks."
    )
    chunk_size = sample_rate * self.asr_config.max_audio_clip_s
    overlap_size = sample_rate * self.asr_config.overlap_chunk_second
    chunks = []
    i = 0
    while i < audio_data.shape[-1]:
        if i + chunk_size >= audio_data.shape[-1]:
            # handle last chunk
            chunks.append(audio_data[..., i:])
            break

        # Find the best split point in the overlap region
        search_start = i + chunk_size - overlap_size
        search_end = min(i + chunk_size, audio_data.shape[-1])
        split_point = self._find_split_point(audio_data, search_start, search_end)

        # Extract chunk up to the split point
        chunks.append(audio_data[..., i:split_point])
        i = split_point
    return chunks
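
In samples, the chunk and overlap windows above are plain sample_rate multiples, and the split point for each chunk is searched within the trailing overlap region. A quick sketch with illustrative values (16 kHz, 30 s clips, and 1 s overlap are assumptions):

# Illustrative sizes for the overlap-aware chunking above (values assumed).
sample_rate = 16_000
max_audio_clip_s = 30
overlap_chunk_second = 1

chunk_size = sample_rate * max_audio_clip_s        # 480_000 samples
overlap_size = sample_rate * overlap_chunk_second  # 16_000 samples

i = 0  # start of the first chunk
search_start = i + chunk_size - overlap_size       # 464_000
search_end = i + chunk_size                        # 480_000
print(search_start, search_end)  # window handed to _find_split_point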

_warmup_audio_preprocessing

_warmup_audio_preprocessing() -> None

Warm up audio processing libraries to avoid first-request latency.

The first call to librosa functions (load, get_duration, mel-spectrogram) triggers JIT compilation and library initialization which can take ~7s. This method warms up these operations during server initialization.

Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
def _warmup_audio_preprocessing(self) -> None:
    """Warm up audio processing libraries to avoid first-request latency.

    The first call to librosa functions (load, get_duration, mel-spectrogram)
    triggers JIT compilation and library initialization which can take ~7s.
    This method warms up these operations during server initialization.
    """
    # Skip warmup if librosa is not installed (optional dependency)
    if isinstance(librosa, PlaceholderModule):
        return

    # Skip warmup if model doesn't support transcription
    if not supports_transcription(self.model_cls):
        return

    if getattr(self.model_cls, "skip_warmup_audio_preprocessing", False):
        return

    try:
        warmup_start = time.perf_counter()
        logger.info("Warming up audio preprocessing libraries...")

        # Create a minimal dummy audio (1 second of silence at target sample rate)
        dummy_audio = np.zeros(int(self.asr_config.sample_rate), dtype=np.float32)

        # Warm up librosa.load by using librosa functions on the dummy data
        # This initializes FFTW, numba JIT, and other audio processing libraries
        _ = librosa.get_duration(y=dummy_audio, sr=self.asr_config.sample_rate)

        # Warm up mel-spectrogram computation with model-specific parameters
        from vllm.transformers_utils.processor import cached_processor_from_config

        processor = cached_processor_from_config(self.model_config)
        feature_extractor = None
        if hasattr(processor, "feature_extractor"):
            feature_extractor = processor.feature_extractor
        elif hasattr(processor, "audio_processor"):
            # For models like GraniteSpeech that use audio_processor
            audio_proc = processor.audio_processor
            if hasattr(audio_proc, "feature_extractor"):
                feature_extractor = audio_proc.feature_extractor
            # If audio_processor doesn't have feature_extractor,
            # skip mel-spectrogram warmup for these models

        if feature_extractor is not None:
            _ = librosa.feature.melspectrogram(
                y=dummy_audio,
                sr=self.asr_config.sample_rate,
                n_mels=getattr(feature_extractor, "n_mels", 128),
                n_fft=getattr(feature_extractor, "n_fft", 400),
                hop_length=getattr(feature_extractor, "hop_length", 160),
            )

        warmup_elapsed = time.perf_counter() - warmup_start
        logger.info("Audio preprocessing warmup completed in %.2fs", warmup_elapsed)
    except Exception:
        # Don't fail initialization if warmup fails - log exception and continue
        logger.exception(
            "Audio preprocessing warmup failed (non-fatal). "
            "First request may experience higher latency."
        )

_warmup_input_processor

_warmup_input_processor() -> None

Warm up input processor with dummy audio to avoid first-request latency.

The first call to input_processor.process_inputs() with multimodal audio triggers multimodal processing initialization which can take ~2.5s. This method processes a dummy audio request to warm up the pipeline.

Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
def _warmup_input_processor(self) -> None:
    """Warm up input processor with dummy audio to avoid first-request latency.

    The first call to input_processor.process_inputs() with multimodal audio
    triggers multimodal processing initialization which can take ~2.5s.
    This method processes a dummy audio request to warm up the pipeline.
    """
    # Skip warmup if model doesn't support transcription
    if not supports_transcription(self.model_cls):
        return

    # Only warm up if model supports transcription methods
    if not hasattr(self.model_cls, "get_generation_prompt"):
        return

    try:
        from vllm.sampling_params import SamplingParams

        warmup_start = time.perf_counter()
        logger.info("Warming up multimodal input processor...")

        # Create minimal dummy audio (1 second of silence)
        dummy_audio = np.zeros(int(self.asr_config.sample_rate), dtype=np.float32)

        # Use the same method that _preprocess_speech_to_text uses
        # to create the prompt
        dummy_prompt = self.model_cls.get_generation_prompt(
            audio=dummy_audio,
            stt_config=self.asr_config,
            model_config=self.model_config,
            language="en",
            task_type=self.task_type,
            request_prompt="",
            to_language=None,
        )

        # Create minimal sampling params
        dummy_params = SamplingParams(
            max_tokens=1,
            temperature=0.0,
            skip_clone=True,  # Internal warmup, safe to skip clone
        )

        # Process the dummy input through the input processor
        # This will trigger all the multimodal processing initialization
        _ = self.input_processor.process_inputs(
            request_id="warmup",
            prompt=dummy_prompt,
            params=dummy_params,
        )

        warmup_elapsed = time.perf_counter() - warmup_start
        logger.info("Input processor warmup completed in %.2fs", warmup_elapsed)
    except Exception:
        # Don't fail initialization if warmup fails - log exception and continue
        logger.exception(
            "Input processor warmup failed (non-fatal). "
            "First request may experience higher latency."
        )

RequestLogger

Source code in vllm/entrypoints/logger.py
class RequestLogger:
    def __init__(self, *, max_log_len: int | None) -> None:
        self.max_log_len = max_log_len

    def log_inputs(
        self,
        request_id: str,
        prompt: str | None,
        prompt_token_ids: list[int] | None,
        prompt_embeds: torch.Tensor | None,
        params: SamplingParams | PoolingParams | BeamSearchParams | None,
        lora_request: LoRARequest | None,
    ) -> None:
        if logger.isEnabledFor(logging.DEBUG):
            max_log_len = self.max_log_len
            if max_log_len is not None:
                if prompt is not None:
                    prompt = prompt[:max_log_len]

                if prompt_token_ids is not None:
                    prompt_token_ids = prompt_token_ids[:max_log_len]

            logger.debug(
                "Request %s details: prompt: %r, "
                "prompt_token_ids: %s, "
                "prompt_embeds shape: %s.",
                request_id,
                prompt,
                prompt_token_ids,
                prompt_embeds.shape if prompt_embeds is not None else None,
            )

        logger.info(
            "Received request %s: params: %s, lora_request: %s.",
            request_id,
            params,
            lora_request,
        )

    def log_outputs(
        self,
        request_id: str,
        outputs: str,
        output_token_ids: Sequence[int] | None,
        finish_reason: str | None = None,
        is_streaming: bool = False,
        delta: bool = False,
    ) -> None:
        max_log_len = self.max_log_len
        if max_log_len is not None:
            if outputs is not None:
                outputs = outputs[:max_log_len]

            if output_token_ids is not None:
                # Convert to list and apply truncation
                output_token_ids = list(output_token_ids)[:max_log_len]

        stream_info = ""
        if is_streaming:
            stream_info = " (streaming delta)" if delta else " (streaming complete)"

        logger.info(
            "Generated response %s%s: output: %r, "
            "output_token_ids: %s, finish_reason: %s",
            request_id,
            stream_info,
            outputs,
            output_token_ids,
            finish_reason,
        )

max_log_len instance-attribute

max_log_len = max_log_len

__init__

__init__(*, max_log_len: int | None) -> None
Source code in vllm/entrypoints/logger.py
def __init__(self, *, max_log_len: int | None) -> None:
    self.max_log_len = max_log_len

log_inputs

log_inputs(
    request_id: str,
    prompt: str | None,
    prompt_token_ids: list[int] | None,
    prompt_embeds: Tensor | None,
    params: SamplingParams
    | PoolingParams
    | BeamSearchParams
    | None,
    lora_request: LoRARequest | None,
) -> None
Source code in vllm/entrypoints/logger.py
def log_inputs(
    self,
    request_id: str,
    prompt: str | None,
    prompt_token_ids: list[int] | None,
    prompt_embeds: torch.Tensor | None,
    params: SamplingParams | PoolingParams | BeamSearchParams | None,
    lora_request: LoRARequest | None,
) -> None:
    if logger.isEnabledFor(logging.DEBUG):
        max_log_len = self.max_log_len
        if max_log_len is not None:
            if prompt is not None:
                prompt = prompt[:max_log_len]

            if prompt_token_ids is not None:
                prompt_token_ids = prompt_token_ids[:max_log_len]

        logger.debug(
            "Request %s details: prompt: %r, "
            "prompt_token_ids: %s, "
            "prompt_embeds shape: %s.",
            request_id,
            prompt,
            prompt_token_ids,
            prompt_embeds.shape if prompt_embeds is not None else None,
        )

    logger.info(
        "Received request %s: params: %s, lora_request: %s.",
        request_id,
        params,
        lora_request,
    )

log_outputs

log_outputs(
    request_id: str,
    outputs: str,
    output_token_ids: Sequence[int] | None,
    finish_reason: str | None = None,
    is_streaming: bool = False,
    delta: bool = False,
) -> None
Source code in vllm/entrypoints/logger.py
def log_outputs(
    self,
    request_id: str,
    outputs: str,
    output_token_ids: Sequence[int] | None,
    finish_reason: str | None = None,
    is_streaming: bool = False,
    delta: bool = False,
) -> None:
    max_log_len = self.max_log_len
    if max_log_len is not None:
        if outputs is not None:
            outputs = outputs[:max_log_len]

        if output_token_ids is not None:
            # Convert to list and apply truncation
            output_token_ids = list(output_token_ids)[:max_log_len]

    stream_info = ""
    if is_streaming:
        stream_info = " (streaming delta)" if delta else " (streaming complete)"

    logger.info(
        "Generated response %s%s: output: %r, "
        "output_token_ids: %s, finish_reason: %s",
        request_id,
        stream_info,
        outputs,
        output_token_ids,
        finish_reason,
    )
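
For reference, a minimal usage sketch of RequestLogger (not taken from the vLLM sources); the request ID, prompt, token IDs, and sampling values are placeholders.

from vllm.entrypoints.logger import RequestLogger
from vllm.sampling_params import SamplingParams

# Truncate logged prompts/outputs to at most 64 characters/tokens.
request_logger = RequestLogger(max_log_len=64)

# Log the incoming request; prompt details are only emitted at DEBUG level.
request_logger.log_inputs(
    request_id="req-123",
    prompt="Transcribe the attached audio.",
    prompt_token_ids=[101, 102, 103],
    prompt_embeds=None,
    params=SamplingParams(max_tokens=16, temperature=0.0),
    lora_request=None,
)

# Log the finished (non-streaming) response.
request_logger.log_outputs(
    request_id="req-123",
    outputs="Hello world.",
    output_token_ids=[7, 8, 9],
    finish_reason="stop",
    is_streaming=False,
)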

RequestOutput

The output data of a completion request to the LLM.

Parameters:

- request_id (str, required): The unique ID of the request.
- prompt (str | None, required): The prompt string of the request. For encoder/decoder models, this is the decoder input prompt.
- prompt_token_ids (list[int] | None, required): The token IDs of the prompt. For encoder/decoder models, these are the decoder input prompt token IDs.
- prompt_logprobs (PromptLogprobs | None, required): The log probabilities to return per prompt token.
- outputs (list[CompletionOutput], required): The output sequences of the request.
- finished (bool, required): Whether the whole request is finished.
- metrics (RequestStateStats | None, default None): Metrics associated with the request.
- lora_request (LoRARequest | None, default None): The LoRA request that was used to generate the output.
- encoder_prompt (str | None, default None): The encoder prompt string of the request. None if decoder-only.
- encoder_prompt_token_ids (list[int] | None, default None): The token IDs of the encoder prompt. None if decoder-only.
- num_cached_tokens (int | None, default None): The number of tokens with prefix cache hit.
- kv_transfer_params (dict[str, Any] | None, default None): The params for remote K/V transfer.
Source code in vllm/outputs.py
class RequestOutput:
    """The output data of a completion request to the LLM.

    Args:
        request_id: The unique ID of the request.
        prompt: The prompt string of the request.
                For encoder/decoder models, this is the
                decoder input prompt.
        prompt_token_ids: The token IDs of the prompt.
                          For encoder/decoder models, this is the
                          decoder input prompt token ids.
        prompt_logprobs: The log probabilities to return per prompt token.
        outputs: The output sequences of the request.
        finished: Whether the whole request is finished.
        metrics: Metrics associated with the request.
        lora_request: The LoRA request that was used to generate the output.
        encoder_prompt: The encoder prompt string of the request.
                        None if decoder-only.
        encoder_prompt_token_ids: The token IDs of the encoder prompt.
                                  None if decoder-only.
        num_cached_tokens: The number of tokens with prefix cache hit.
        kv_transfer_params: The params for remote K/V transfer.
    """

    def __init__(
        self,
        request_id: str,
        prompt: str | None,
        prompt_token_ids: list[int] | None,
        prompt_logprobs: PromptLogprobs | None,
        outputs: list[CompletionOutput],
        finished: bool,
        metrics: RequestStateStats | None = None,
        lora_request: LoRARequest | None = None,
        encoder_prompt: str | None = None,
        encoder_prompt_token_ids: list[int] | None = None,
        num_cached_tokens: int | None = None,
        *,
        multi_modal_placeholders: MultiModalPlaceholderDict | None = None,
        kv_transfer_params: dict[str, Any] | None = None,
        # Forward compatibility, code that uses args added in new release can
        # still run with older versions of vLLM without breaking.
        **kwargs: Any,
    ) -> None:
        if kwargs:
            logger.warning_once(
                "RequestOutput: Ignoring extra arguments: %s", str(kwargs)
            )
        self.request_id = request_id
        self.prompt = prompt
        self.prompt_token_ids = prompt_token_ids
        self.multi_modal_placeholders = multi_modal_placeholders or {}
        self.prompt_logprobs = prompt_logprobs
        self.outputs = outputs
        self.finished = finished
        self.metrics = metrics
        self.lora_request = lora_request
        self.encoder_prompt = encoder_prompt
        self.encoder_prompt_token_ids = encoder_prompt_token_ids
        self.num_cached_tokens = num_cached_tokens
        self.kv_transfer_params = kv_transfer_params

    def add(self, next_output: "RequestOutput", aggregate: bool) -> None:
        """Merge subsequent RequestOutput into this one"""

        self.finished |= next_output.finished
        self.kv_transfer_params = next_output.kv_transfer_params

        for next_completion in next_output.outputs:
            for i, completion in enumerate(self.outputs):
                if completion.index == next_completion.index:
                    if aggregate:
                        # Merge outputs with same index
                        completion.text += next_completion.text
                        if not isinstance(completion.token_ids, MutableSequence):
                            completion.token_ids = list(completion.token_ids)
                        completion.token_ids.extend(next_completion.token_ids)
                        if next_completion.logprobs:
                            assert completion.logprobs is not None
                            completion.logprobs.extend(next_completion.logprobs)
                        completion.cumulative_logprob = (
                            next_completion.cumulative_logprob
                        )
                        completion.finish_reason = next_completion.finish_reason
                        completion.stop_reason = next_completion.stop_reason
                    else:
                        # Replace the output with the new one
                        self.outputs[i] = next_completion
                    break
            else:
                self.outputs.append(next_completion)

    def __repr__(self) -> str:
        return (
            f"RequestOutput(request_id={self.request_id}, "
            f"prompt={self.prompt!r}, "
            f"prompt_token_ids={self.prompt_token_ids}, "
            f"encoder_prompt={self.encoder_prompt!r}, "
            f"encoder_prompt_token_ids={self.encoder_prompt_token_ids}, "
            f"prompt_logprobs={self.prompt_logprobs}, "
            f"outputs={self.outputs}, "
            f"finished={self.finished}, "
            f"metrics={self.metrics}, "
            f"lora_request={self.lora_request}, "
            f"num_cached_tokens={self.num_cached_tokens}, "
            f"multi_modal_placeholders={self.multi_modal_placeholders})"
        )

encoder_prompt instance-attribute

encoder_prompt = encoder_prompt

encoder_prompt_token_ids instance-attribute

encoder_prompt_token_ids = encoder_prompt_token_ids

finished instance-attribute

finished = finished

kv_transfer_params instance-attribute

kv_transfer_params = kv_transfer_params

lora_request instance-attribute

lora_request = lora_request

metrics instance-attribute

metrics = metrics

multi_modal_placeholders instance-attribute

multi_modal_placeholders = multi_modal_placeholders or {}

num_cached_tokens instance-attribute

num_cached_tokens = num_cached_tokens

outputs instance-attribute

outputs = outputs

prompt instance-attribute

prompt = prompt

prompt_logprobs instance-attribute

prompt_logprobs = prompt_logprobs

prompt_token_ids instance-attribute

prompt_token_ids = prompt_token_ids

request_id instance-attribute

request_id = request_id

__init__

__init__(
    request_id: str,
    prompt: str | None,
    prompt_token_ids: list[int] | None,
    prompt_logprobs: PromptLogprobs | None,
    outputs: list[CompletionOutput],
    finished: bool,
    metrics: RequestStateStats | None = None,
    lora_request: LoRARequest | None = None,
    encoder_prompt: str | None = None,
    encoder_prompt_token_ids: list[int] | None = None,
    num_cached_tokens: int | None = None,
    *,
    multi_modal_placeholders: MultiModalPlaceholderDict
    | None = None,
    kv_transfer_params: dict[str, Any] | None = None,
    **kwargs: Any,
) -> None
Source code in vllm/outputs.py
def __init__(
    self,
    request_id: str,
    prompt: str | None,
    prompt_token_ids: list[int] | None,
    prompt_logprobs: PromptLogprobs | None,
    outputs: list[CompletionOutput],
    finished: bool,
    metrics: RequestStateStats | None = None,
    lora_request: LoRARequest | None = None,
    encoder_prompt: str | None = None,
    encoder_prompt_token_ids: list[int] | None = None,
    num_cached_tokens: int | None = None,
    *,
    multi_modal_placeholders: MultiModalPlaceholderDict | None = None,
    kv_transfer_params: dict[str, Any] | None = None,
    # Forward compatibility, code that uses args added in new release can
    # still run with older versions of vLLM without breaking.
    **kwargs: Any,
) -> None:
    if kwargs:
        logger.warning_once(
            "RequestOutput: Ignoring extra arguments: %s", str(kwargs)
        )
    self.request_id = request_id
    self.prompt = prompt
    self.prompt_token_ids = prompt_token_ids
    self.multi_modal_placeholders = multi_modal_placeholders or {}
    self.prompt_logprobs = prompt_logprobs
    self.outputs = outputs
    self.finished = finished
    self.metrics = metrics
    self.lora_request = lora_request
    self.encoder_prompt = encoder_prompt
    self.encoder_prompt_token_ids = encoder_prompt_token_ids
    self.num_cached_tokens = num_cached_tokens
    self.kv_transfer_params = kv_transfer_params

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    return (
        f"RequestOutput(request_id={self.request_id}, "
        f"prompt={self.prompt!r}, "
        f"prompt_token_ids={self.prompt_token_ids}, "
        f"encoder_prompt={self.encoder_prompt!r}, "
        f"encoder_prompt_token_ids={self.encoder_prompt_token_ids}, "
        f"prompt_logprobs={self.prompt_logprobs}, "
        f"outputs={self.outputs}, "
        f"finished={self.finished}, "
        f"metrics={self.metrics}, "
        f"lora_request={self.lora_request}, "
        f"num_cached_tokens={self.num_cached_tokens}, "
        f"multi_modal_placeholders={self.multi_modal_placeholders})"
    )

add

add(next_output: RequestOutput, aggregate: bool) -> None

Merge subsequent RequestOutput into this one

Source code in vllm/outputs.py
def add(self, next_output: "RequestOutput", aggregate: bool) -> None:
    """Merge subsequent RequestOutput into this one"""

    self.finished |= next_output.finished
    self.kv_transfer_params = next_output.kv_transfer_params

    for next_completion in next_output.outputs:
        for i, completion in enumerate(self.outputs):
            if completion.index == next_completion.index:
                if aggregate:
                    # Merge outputs with same index
                    completion.text += next_completion.text
                    if not isinstance(completion.token_ids, MutableSequence):
                        completion.token_ids = list(completion.token_ids)
                    completion.token_ids.extend(next_completion.token_ids)
                    if next_completion.logprobs:
                        assert completion.logprobs is not None
                        completion.logprobs.extend(next_completion.logprobs)
                    completion.cumulative_logprob = (
                        next_completion.cumulative_logprob
                    )
                    completion.finish_reason = next_completion.finish_reason
                    completion.stop_reason = next_completion.stop_reason
                else:
                    # Replace the output with the new one
                    self.outputs[i] = next_completion
                break
        else:
            self.outputs.append(next_completion)
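
A hedged sketch of how add() can accumulate streaming deltas; delta_stream stands for any async iterator of per-step RequestOutput objects (for example, one produced with RequestOutputKind.DELTA) and is not part of the API shown above.

from vllm.outputs import RequestOutput


async def collect_final(delta_stream) -> RequestOutput:
    """Merge a stream of delta RequestOutputs into a single final output."""
    final: RequestOutput | None = None
    async for delta in delta_stream:
        if final is None:
            final = delta
        else:
            # aggregate=True appends each delta's text/token_ids onto the
            # completion with the same index; aggregate=False would replace it.
            final.add(delta, aggregate=True)
    assert final is not None, "stream yielded no outputs"
    return final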

RequestResponseMetadata

Bases: BaseModel

Source code in vllm/entrypoints/openai/engine/protocol.py
class RequestResponseMetadata(BaseModel):
    request_id: str
    final_usage_info: UsageInfo | None = None

final_usage_info class-attribute instance-attribute

final_usage_info: UsageInfo | None = None

request_id instance-attribute

request_id: str

TranscriptionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranscriptionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/audio/createTranscription

    file: UploadFile
    """
    The audio file object (not file name) to transcribe, in one of these
    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    """

    model: str | None = None
    """ID of the model to use.
    """

    language: str | None = None
    """The language of the input audio.

    Supplying the input language in
    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
    will improve accuracy and latency.
    """

    prompt: str = Field(default="")
    """An optional text to guide the model's style or continue a previous audio
    segment.

    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
    should match the audio language.
    """

    response_format: AudioResponseFormat = Field(default="json")
    """
    The format of the output, in one of these options: `json`, `text`, `srt`,
    `verbose_json`, or `vtt`.
    """

    ## TODO (varun) : Support if set to 0, certain thresholds are met !!

    timestamp_granularities: list[Literal["word", "segment"]] = Field(
        alias="timestamp_granularities[]", default=[]
    )
    """The timestamp granularities to populate for this transcription.

    `response_format` must be set to `verbose_json` to use timestamp granularities.
    Either or both of these options are supported: `word` and `segment`. Note:
    There is no additional latency for segment timestamps, but generating word
    timestamps incurs additional latency.
    """

    stream: bool | None = False
    """When set, it will enable output to be streamed in a similar fashion
    as the Chat Completion endpoint.
    """
    # --8<-- [start:transcription-extra-params]
    # Flattened stream option to simplify form data.
    stream_include_usage: bool | None = False
    stream_continuous_usage_stats: bool | None = False

    vllm_xargs: dict[str, str | int | float] | None = Field(
        default=None,
        description=(
            "Additional request parameters with string or "
            "numeric values, used by custom extensions."
        ),
    )
    # --8<-- [end:transcription-extra-params]

    to_language: str | None = None
    """The language of the output audio we transcribe to.

    Please note that this is not currently used by supported models at this
    time, but it is a placeholder for future use, matching translation api.
    """

    # --8<-- [start:transcription-sampling-params]
    temperature: float = Field(default=0.0)
    """The sampling temperature, between 0 and 1.

    Higher values like 0.8 will make the output more random, while lower values
    like 0.2 will make it more focused / deterministic. If set to 0, the model
    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
    to automatically increase the temperature until certain thresholds are hit.
    """

    top_p: float | None = None
    """Enables nucleus (top-p) sampling, where tokens are selected from the
    smallest possible set whose cumulative probability exceeds `p`.
    """

    top_k: int | None = None
    """Limits sampling to the `k` most probable tokens at each step."""

    min_p: float | None = None
    """Filters out tokens with a probability lower than `min_p`, ensuring a
    minimum likelihood threshold during sampling.
    """

    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    """The seed to use for sampling."""

    frequency_penalty: float | None = 0.0
    """The frequency penalty to use for sampling."""

    repetition_penalty: float | None = None
    """The repetition penalty to use for sampling."""

    presence_penalty: float | None = 0.0
    """The presence penalty to use for sampling."""

    max_completion_tokens: int | None = None
    """The maximum number of tokens to generate."""
    # --8<-- [end:transcription-sampling-params]

    # Default sampling parameters for transcription requests.
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "repetition_penalty": 1.0,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 0,
        "min_p": 0.0,
    }

    def to_sampling_params(
        self, default_max_tokens: int, default_sampling_params: dict | None = None
    ) -> SamplingParams:
        max_tokens = default_max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}

        # Default parameters
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
            )
        if (top_p := self.top_p) is None:
            top_p = default_sampling_params.get(
                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
            )
        if (top_k := self.top_k) is None:
            top_k = default_sampling_params.get(
                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
            )
        if (min_p := self.min_p) is None:
            min_p = default_sampling_params.get(
                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
            )

        if (repetition_penalty := self.repetition_penalty) is None:
            repetition_penalty = default_sampling_params.get(
                "repetition_penalty",
                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
            )

        return SamplingParams.from_optional(
            temperature=temperature,
            max_tokens=max_tokens,
            seed=self.seed,
            top_p=top_p,
            top_k=top_k,
            min_p=min_p,
            frequency_penalty=self.frequency_penalty,
            repetition_penalty=repetition_penalty,
            presence_penalty=self.presence_penalty,
            output_kind=RequestOutputKind.DELTA
            if self.stream
            else RequestOutputKind.FINAL_ONLY,
            extra_args=self.vllm_xargs,
            skip_clone=True,  # Created fresh per request, safe to skip clone
        )

    @model_validator(mode="before")
    @classmethod
    def validate_transcription_request(cls, data):
        if isinstance(data.get("file"), str):
            raise HTTPException(
                status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
                detail="Expected 'file' to be a file-like object, not 'str'.",
            )

        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
        stream = data.get("stream", False)
        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
            # Find which specific stream option was set
            invalid_param = next(
                (so for so in stream_opts if data.get(so, False)),
                "stream_include_usage",
            )
            raise VLLMValidationError(
                "Stream options can only be defined when `stream=True`.",
                parameter=invalid_param,
            )

        return data

_DEFAULT_SAMPLING_PARAMS class-attribute instance-attribute

_DEFAULT_SAMPLING_PARAMS: dict = {
    "repetition_penalty": 1.0,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 0,
    "min_p": 0.0,
}

file instance-attribute

file: UploadFile

The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.

frequency_penalty class-attribute instance-attribute

frequency_penalty: float | None = 0.0

The frequency penalty to use for sampling.

language class-attribute instance-attribute

language: str | None = None

The language of the input audio.

Supplying the input language in ISO-639-1 format will improve accuracy and latency.

max_completion_tokens class-attribute instance-attribute

max_completion_tokens: int | None = None

The maximum number of tokens to generate.

min_p class-attribute instance-attribute

min_p: float | None = None

Filters out tokens with a probability lower than min_p, ensuring a minimum likelihood threshold during sampling.

model class-attribute instance-attribute

model: str | None = None

ID of the model to use.

presence_penalty class-attribute instance-attribute

presence_penalty: float | None = 0.0

The presence penalty to use for sampling.

prompt class-attribute instance-attribute

prompt: str = Field(default='')

An optional text to guide the model's style or continue a previous audio segment.

The prompt should match the audio language.

repetition_penalty class-attribute instance-attribute

repetition_penalty: float | None = None

The repetition penalty to use for sampling.

response_format class-attribute instance-attribute

response_format: AudioResponseFormat = Field(default="json")

The format of the output, in one of these options: json, text, srt, verbose_json, or vtt.

seed class-attribute instance-attribute

seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)

The seed to use for sampling.

stream class-attribute instance-attribute

stream: bool | None = False

When set, output will be streamed in a similar fashion to the Chat Completions endpoint.

stream_continuous_usage_stats class-attribute instance-attribute

stream_continuous_usage_stats: bool | None = False

stream_include_usage class-attribute instance-attribute

stream_include_usage: bool | None = False

temperature class-attribute instance-attribute

temperature: float = Field(default=0.0)

The sampling temperature, between 0 and 1.

Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused / deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.

timestamp_granularities class-attribute instance-attribute

timestamp_granularities: list[
    Literal["word", "segment"]
] = Field(alias="timestamp_granularities[]", default=[])

The timestamp granularities to populate for this transcription.

response_format must be set to verbose_json to use timestamp granularities. Either or both of these options are supported: word and segment. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.

to_language class-attribute instance-attribute

to_language: str | None = None

The target language for the transcription output.

Please note that this is not currently used by supported models; it is a placeholder for future use, matching the translation API.

top_k class-attribute instance-attribute

top_k: int | None = None

Limits sampling to the k most probable tokens at each step.

top_p class-attribute instance-attribute

top_p: float | None = None

Enables nucleus (top-p) sampling, where tokens are selected from the smallest possible set whose cumulative probability exceeds p.

vllm_xargs class-attribute instance-attribute

vllm_xargs: dict[str, str | int | float] | None = Field(
    default=None,
    description="Additional request parameters with string or numeric values, used by custom extensions.",
)

to_sampling_params

to_sampling_params(
    default_max_tokens: int,
    default_sampling_params: dict | None = None,
) -> SamplingParams
Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
def to_sampling_params(
    self, default_max_tokens: int, default_sampling_params: dict | None = None
) -> SamplingParams:
    max_tokens = default_max_tokens

    if default_sampling_params is None:
        default_sampling_params = {}

    # Default parameters
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
        )
    if (top_p := self.top_p) is None:
        top_p = default_sampling_params.get(
            "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
        )
    if (top_k := self.top_k) is None:
        top_k = default_sampling_params.get(
            "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
        )
    if (min_p := self.min_p) is None:
        min_p = default_sampling_params.get(
            "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
        )

    if (repetition_penalty := self.repetition_penalty) is None:
        repetition_penalty = default_sampling_params.get(
            "repetition_penalty",
            self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
        )

    return SamplingParams.from_optional(
        temperature=temperature,
        max_tokens=max_tokens,
        seed=self.seed,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        frequency_penalty=self.frequency_penalty,
        repetition_penalty=repetition_penalty,
        presence_penalty=self.presence_penalty,
        output_kind=RequestOutputKind.DELTA
        if self.stream
        else RequestOutputKind.FINAL_ONLY,
        extra_args=self.vllm_xargs,
        skip_clone=True,  # Created fresh per request, safe to skip clone
    )
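
For illustration only: the method above resolves each sampling parameter in the order request value, then server-level default_sampling_params, then the class-level _DEFAULT_SAMPLING_PARAMS. The helper and values below are invented to demonstrate that order and are not part of vLLM.

# Hypothetical stand-in for TranscriptionRequest._DEFAULT_SAMPLING_PARAMS.
_CLASS_DEFAULTS = {"temperature": 1.0, "top_p": 1.0, "top_k": 0, "min_p": 0.0}


def resolve(name: str, request_value, server_defaults: dict | None):
    # 1. An explicit request value wins.
    if request_value is not None:
        return request_value
    # 2. Otherwise fall back to the server-level defaults, if provided.
    server_defaults = server_defaults or {}
    # 3. Otherwise use the class-level default.
    return server_defaults.get(name, _CLASS_DEFAULTS[name])


assert resolve("top_p", 0.5, {"top_p": 0.9}) == 0.5   # request value wins
assert resolve("top_p", None, {"top_p": 0.9}) == 0.9  # server default applies
assert resolve("top_k", None, None) == 0              # class default applies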

validate_transcription_request classmethod

validate_transcription_request(data)
Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
@model_validator(mode="before")
@classmethod
def validate_transcription_request(cls, data):
    if isinstance(data.get("file"), str):
        raise HTTPException(
            status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
            detail="Expected 'file' to be a file-like object, not 'str'.",
        )

    stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
    stream = data.get("stream", False)
    if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
        # Find which specific stream option was set
        invalid_param = next(
            (so for so in stream_opts if data.get(so, False)),
            "stream_include_usage",
        )
        raise VLLMValidationError(
            "Stream options can only be defined when `stream=True`.",
            parameter=invalid_param,
        )

    return data
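
A client-side sketch of hitting the OpenAI-compatible /v1/audio/transcriptions endpoint backed by this request model. It assumes a locally running vLLM server with a transcription-capable model; the base URL, API key, model name, and file path are placeholders.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("sample.wav", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="openai/whisper-large-v3",  # must match the served model name
        file=audio_file,
        language="en",           # ISO-639-1 hint (TranscriptionRequest.language)
        response_format="json",  # json, text, srt, verbose_json, or vtt
        temperature=0.0,
    )

print(transcription.text)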

TranscriptionResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranscriptionResponse(OpenAIBaseModel):
    text: str
    """The transcribed text."""
    usage: TranscriptionUsageAudio

text instance-attribute

text: str

The transcribed text.

usage instance-attribute

usage: TranscriptionUsageAudio

TranscriptionResponseStreamChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranscriptionResponseStreamChoice(OpenAIBaseModel):
    delta: DeltaMessage
    finish_reason: str | None = None
    stop_reason: int | str | None = None

delta instance-attribute

delta: DeltaMessage

finish_reason class-attribute instance-attribute

finish_reason: str | None = None

stop_reason class-attribute instance-attribute

stop_reason: int | str | None = None

TranscriptionResponseVerbose

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranscriptionResponseVerbose(OpenAIBaseModel):
    duration: str
    """The duration of the input audio."""

    language: str
    """The language of the input audio."""

    text: str
    """The transcribed text."""

    segments: list[TranscriptionSegment] | None = None
    """Segments of the transcribed text and their corresponding details."""

    words: list[TranscriptionWord] | None = None
    """Extracted words and their corresponding timestamps."""

duration instance-attribute

duration: str

The duration of the input audio.

language instance-attribute

language: str

The language of the input audio.

segments class-attribute instance-attribute

segments: list[TranscriptionSegment] | None = None

Segments of the transcribed text and their corresponding details.

text instance-attribute

text: str

The transcribed text.

words class-attribute instance-attribute

words: list[TranscriptionWord] | None = None

Extracted words and their corresponding timestamps.

TranscriptionStreamResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranscriptionStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}")
    object: Literal["transcription.chunk"] = "transcription.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[TranscriptionResponseStreamChoice]
    usage: UsageInfo | None = Field(default=None)

choices instance-attribute

choices: list[TranscriptionResponseStreamChoice]

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time.time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"trsc-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal["transcription.chunk"] = (
    "transcription.chunk"
)

usage class-attribute instance-attribute

usage: UsageInfo | None = Field(default=None)
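
A hedged sketch of consuming the streaming variant, assuming the endpoint emits OpenAI-style server-sent events whose data payloads are shaped like TranscriptionStreamResponse (with the text deltas carried in choices[].delta.content). The endpoint URL, model name, and file path are placeholders.

import json

import requests

with open("sample.wav", "rb") as audio_file:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": ("sample.wav", audio_file, "audio/wav")},
        data={"model": "openai/whisper-large-v3", "stream": "true"},
        stream=True,
        timeout=300,
    )

resp.raise_for_status()
for line in resp.iter_lines():
    # Skip keep-alive blanks and anything that is not an SSE data line.
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)  # assumed to mirror TranscriptionStreamResponse
    for choice in chunk["choices"]:
        print(choice["delta"].get("content", ""), end="", flush=True)
print()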

TranslationRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranslationRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/audio/createTranslation

    file: UploadFile
    """
    The audio file object (not file name) to translate, in one of these
    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    """

    model: str | None = None
    """ID of the model to use.
    """

    prompt: str = Field(default="")
    """An optional text to guide the model's style or continue a previous audio
    segment.

    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
    should match the audio language.
    """

    response_format: AudioResponseFormat = Field(default="json")
    """
    The format of the output, in one of these options: `json`, `text`, `srt`,
    `verbose_json`, or `vtt`.
    """

    # TODO support additional sampling parameters
    # --8<-- [start:translation-sampling-params]
    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    """The seed to use for sampling."""

    temperature: float = Field(default=0.0)
    """The sampling temperature, between 0 and 1.

    Higher values like 0.8 will make the output more random, while lower values
    like 0.2 will make it more focused / deterministic. If set to 0, the model
    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
    to automatically increase the temperature until certain thresholds are hit.
    """
    # --8<-- [end:translation-sampling-params]

    # --8<-- [start:translation-extra-params]
    language: str | None = None
    """The language of the input audio we translate from.

    Supplying the input language in
    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
    will improve accuracy.
    """

    to_language: str | None = None
    """The language of the input audio we translate to.

    Please note that this is not supported by all models; refer to the specific
    model documentation for more details.
    For instance, Whisper only supports `to_language=en`.
    """

    stream: bool | None = False
    """Custom field not present in the original OpenAI definition. When set,
    it will enable output to be streamed in a similar fashion as the Chat
    Completion endpoint.
    """
    # Flattened stream option to simplify form data.
    stream_include_usage: bool | None = False
    stream_continuous_usage_stats: bool | None = False

    max_completion_tokens: int | None = None
    """The maximum number of tokens to generate."""
    # --8<-- [end:translation-extra-params]

    # Default sampling parameters for translation requests.
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "temperature": 0,
    }

    def to_sampling_params(
        self, default_max_tokens: int, default_sampling_params: dict | None = None
    ) -> SamplingParams:
        max_tokens = default_max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}
        # Default parameters
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
            )

        return SamplingParams.from_optional(
            temperature=temperature,
            max_tokens=max_tokens,
            seed=self.seed,
            output_kind=RequestOutputKind.DELTA
            if self.stream
            else RequestOutputKind.FINAL_ONLY,
            skip_clone=True,  # Created fresh per request, safe to skip clone
        )

    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):
        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
        stream = data.get("stream", False)
        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
            # Find which specific stream option was set
            invalid_param = next(
                (so for so in stream_opts if data.get(so, False)),
                "stream_include_usage",
            )
            raise VLLMValidationError(
                "Stream options can only be defined when `stream=True`.",
                parameter=invalid_param,
            )

        return data

_DEFAULT_SAMPLING_PARAMS class-attribute instance-attribute

_DEFAULT_SAMPLING_PARAMS: dict = {'temperature': 0}

file instance-attribute

file: UploadFile

The audio file object (not file name) to translate, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.

language class-attribute instance-attribute

language: str | None = None

The language of the input audio we translate from.

Supplying the input language in ISO-639-1 format will improve accuracy.

max_completion_tokens class-attribute instance-attribute

max_completion_tokens: int | None = None

The maximum number of tokens to generate.

model class-attribute instance-attribute

model: str | None = None

ID of the model to use.

prompt class-attribute instance-attribute

prompt: str = Field(default='')

An optional text to guide the model's style or continue a previous audio segment.

The prompt should match the audio language.

response_format class-attribute instance-attribute

response_format: AudioResponseFormat = Field(default="json")

The format of the output, in one of these options: json, text, srt, verbose_json, or vtt.

seed class-attribute instance-attribute

seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)

The seed to use for sampling.

stream class-attribute instance-attribute

stream: bool | None = False

Custom field not present in the original OpenAI definition. When set, output will be streamed in a similar fashion to the Chat Completions endpoint.

stream_continuous_usage_stats class-attribute instance-attribute

stream_continuous_usage_stats: bool | None = False

stream_include_usage class-attribute instance-attribute

stream_include_usage: bool | None = False

temperature class-attribute instance-attribute

temperature: float = Field(default=0.0)

The sampling temperature, between 0 and 1.

Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused / deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.

to_language class-attribute instance-attribute

to_language: str | None = None

The language of the input audio we translate to.

Please note that this is not supported by all models; refer to the specific model documentation for more details. For instance, Whisper only supports to_language=en.

to_sampling_params

to_sampling_params(
    default_max_tokens: int,
    default_sampling_params: dict | None = None,
) -> SamplingParams
Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
def to_sampling_params(
    self, default_max_tokens: int, default_sampling_params: dict | None = None
) -> SamplingParams:
    max_tokens = default_max_tokens

    if default_sampling_params is None:
        default_sampling_params = {}
    # Default parameters
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
        )

    return SamplingParams.from_optional(
        temperature=temperature,
        max_tokens=max_tokens,
        seed=self.seed,
        output_kind=RequestOutputKind.DELTA
        if self.stream
        else RequestOutputKind.FINAL_ONLY,
        skip_clone=True,  # Created fresh per request, safe to skip clone
    )

validate_stream_options classmethod

validate_stream_options(data)
Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
@model_validator(mode="before")
@classmethod
def validate_stream_options(cls, data):
    stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
    stream = data.get("stream", False)
    if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
        # Find which specific stream option was set
        invalid_param = next(
            (so for so in stream_opts if data.get(so, False)),
            "stream_include_usage",
        )
        raise VLLMValidationError(
            "Stream options can only be defined when `stream=True`.",
            parameter=invalid_param,
        )

    return data
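
A client-side sketch of posting a translation request as multipart form data, which is how this model's fields reach the server. It assumes a local vLLM server with a translation-capable model and that the vLLM-specific fields (language, to_language) are accepted as plain form fields; the endpoint URL, model name, and file path are placeholders.

import requests

with open("speech_fr.wav", "rb") as audio_file:
    resp = requests.post(
        "http://localhost:8000/v1/audio/translations",
        files={"file": ("speech_fr.wav", audio_file, "audio/wav")},
        data={
            "model": "openai/whisper-large-v3",  # served model name
            "language": "fr",        # source language (vLLM extra param)
            "to_language": "en",     # target language; Whisper only supports "en"
            "response_format": "json",
            "temperature": "0.0",
        },
        timeout=300,
    )

resp.raise_for_status()
print(resp.json()["text"])  # TranslationResponse.text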

TranslationResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranslationResponse(OpenAIBaseModel):
    text: str
    """The translated text."""

text instance-attribute

text: str

The translated text.

TranslationResponseStreamChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranslationResponseStreamChoice(OpenAIBaseModel):
    delta: DeltaMessage
    finish_reason: str | None = None
    stop_reason: int | str | None = None

delta instance-attribute

delta: DeltaMessage

finish_reason class-attribute instance-attribute

finish_reason: str | None = None

stop_reason class-attribute instance-attribute

stop_reason: int | str | None = None

TranslationResponseVerbose

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranslationResponseVerbose(OpenAIBaseModel):
    duration: str
    """The duration of the input audio."""

    language: str
    """The language of the input audio."""

    text: str
    """The translated text."""

    segments: list[TranslationSegment] | None = None
    """Segments of the translated text and their corresponding details."""

    words: list[TranslationWord] | None = None
    """Extracted words and their corresponding timestamps."""

duration instance-attribute

duration: str

The duration of the input audio.

language instance-attribute

language: str

The language of the input audio.

segments class-attribute instance-attribute

segments: list[TranslationSegment] | None = None

Segments of the translated text and their corresponding details.

text instance-attribute

text: str

The translated text.

words class-attribute instance-attribute

words: list[TranslationWord] | None = None

Extracted words and their corresponding timestamps.

TranslationStreamResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranslationStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}")
    object: Literal["translation.chunk"] = "translation.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[TranslationResponseStreamChoice]
    usage: UsageInfo | None = Field(default=None)

choices instance-attribute

choices: list[TranslationResponseStreamChoice]

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time.time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"trsl-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal['translation.chunk'] = 'translation.chunk'

usage class-attribute instance-attribute

usage: UsageInfo | None = Field(default=None)

init_logger

init_logger(name: str) -> _VllmLogger

The main purpose of this function is to ensure that loggers are retrieved in such a way that we can be sure the root vllm logger has already been configured.

Source code in vllm/logger.py
def init_logger(name: str) -> _VllmLogger:
    """The main purpose of this function is to ensure that loggers are
    retrieved in such a way that we can be sure the root vllm logger has
    already been configured."""

    logger = logging.getLogger(name)

    for method_name, method in _METHODS_TO_PATCH.items():
        setattr(logger, method_name, MethodType(method, logger))

    return cast(_VllmLogger, logger)
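
A minimal usage sketch of init_logger, following the module-level pattern used throughout vLLM.

from vllm.logger import init_logger

# Retrieve a logger only after the root "vllm" logger has been configured.
logger = init_logger(__name__)
logger.info("Speech-to-text serving module initialized")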