Skip to content

vllm.entrypoints.speech_to_text.transcription.protocol

TranscriptionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/speech_to_text/transcription/protocol.py
class TranscriptionRequest(OpenAIBaseModel):
    # Request model for audio transcription. Besides the request fields, it
    # knows how to turn itself into `SpeechToTextParams`, `BeamSearchParams`
    # and `SamplingParams` (see the `build_*` / `to_*` methods below).
    #
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/audio/createTranscription

    file: UploadFile
    """
    The audio file object (not file name) to transcribe, in one of these
    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    """

    model: str | None = None
    """ID of the model to use.
    """

    language: str | None = None
    """The language of the input audio.

    Supplying the input language in
    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
    will improve accuracy and latency.
    """

    hotwords: str | None = None
    """
    Important words or phrases that the model should pay extra attention to
    during transcription.
    """

    prompt: str = Field(default="")
    """An optional text to guide the model's style or continue a previous audio
    segment.

    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
    should match the audio language.
    """

    response_format: AudioResponseFormat = Field(default="json")
    """
    The format of the output, in one of these options: `json`, `text`, `srt`,
    `verbose_json`, or `vtt`.
    """

    # TODO (varun): Support the `temperature=0` behavior described on the
    # `temperature` field, where the temperature is raised automatically until
    # certain thresholds are met.

    timestamp_granularities: list[Literal["word", "segment"]] = Field(
        alias="timestamp_granularities[]", default=[]
    )
    """The timestamp granularities to populate for this transcription.

    `response_format` must be set to `verbose_json` to use timestamp
    granularities. Either or both of these options are supported: `word`, or
    `segment`. Note: There is no additional latency for segment timestamps,
    but generating word timestamps incurs additional latency.
    """

    stream: bool | None = False
    """When set, it will enable output to be streamed in a similar fashion
    as the Chat Completion endpoint.
    """
    # --8<-- [start:transcription-extra-params]
    # Flattened stream option to simplify form data.
    # Both options are only accepted together with `stream=True`; this is
    # enforced by `validate_transcription_request`.
    stream_include_usage: bool | None = False
    stream_continuous_usage_stats: bool | None = False

    vllm_xargs: dict[str, str | int | float | bool] | None = Field(
        default=None,
        description=(
            "Additional request parameters with string or "
            "numeric values, used by custom extensions."
        ),
    )
    # --8<-- [end:transcription-extra-params]

    to_language: str | None = None
    """The language of the output text we transcribe to.

    Please note that this is not currently used by supported models, but it
    is a placeholder for future use, matching translation api.
    """

    # --8<-- [start:transcription-sampling-params]
    use_beam_search: bool = False
    """Whether or not beam search should be used."""

    n: int = 1
    """The number of beams to be used in beam search."""

    length_penalty: float = 1.0
    """Length penalty to be used for beam search."""

    include_stop_str_in_output: bool = False
    """Whether to include the stop strings in output text."""

    temperature: float = Field(default=0.0)
    """The sampling temperature, between 0 and 1.

    Higher values like 0.8 will make the output more random, while lower values
    like 0.2 will make it more focused / deterministic. If set to 0, the model
    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
    to automatically increase the temperature until certain thresholds are hit.
    """

    top_p: float | None = None
    """Enables nucleus (top-p) sampling, where tokens are selected from the
    smallest possible set whose cumulative probability exceeds `p`.
    """

    top_k: int | None = None
    """Limits sampling to the `k` most probable tokens at each step."""

    min_p: float | None = None
    """Filters out tokens with a probability lower than `min_p`, ensuring a
    minimum likelihood threshold during sampling.
    """

    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    """The seed to use for sampling."""

    frequency_penalty: float | None = 0.0
    """The frequency penalty to use for sampling."""

    repetition_penalty: float | None = None
    """The repetition penalty to use for sampling."""

    presence_penalty: float | None = 0.0
    """The presence penalty to use for sampling."""

    max_completion_tokens: int | None = None
    """The maximum number of tokens to generate."""
    # --8<-- [end:transcription-sampling-params]

    # Default sampling parameters for transcription requests.
    # Used by `to_sampling_params` as the last fallback, after the request
    # value and the caller-provided `default_sampling_params`.
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "repetition_penalty": 1.0,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 0,
        "min_p": 0.0,
    }

    def build_stt_params(
        self,
        audio: "np.ndarray",
        stt_config: "SpeechToTextConfig",
        model_config: "ModelConfig",
        task_type: str,
    ) -> SpeechToTextParams:
        """Bundle the audio and configs with this request's text options.

        `language`, `prompt`, `to_language` and `hotwords` are taken from the
        request; everything else is forwarded from the arguments unchanged.
        """
        return SpeechToTextParams(
            audio=audio,
            stt_config=stt_config,
            model_config=model_config,
            language=self.language,
            task_type=task_type,
            request_prompt=self.prompt,
            to_language=self.to_language,
            hotwords=self.hotwords,
        )

    def to_beam_search_params(
        self,
        default_max_tokens: int,
        default_sampling_params: dict | None = None,
    ) -> BeamSearchParams:
        """Build `BeamSearchParams` from this request.

        Args:
            default_max_tokens: Used as `max_tokens` as-is; this method does
                not consult `max_completion_tokens`.
            default_sampling_params: Optional fallback values; only the
                `"temperature"` key is read here.

        Returns:
            Beam search parameters with `beam_width` taken from `n`.
        """
        if default_sampling_params is None:
            default_sampling_params = {}

        max_tokens = default_max_tokens
        n = self.n if self.n is not None else 1

        # NOTE: Temp 0 is a different fallback than completions
        # NOTE(review): `temperature` is declared as a non-optional float with
        # a 0.0 default, so this fallback is presumably only reachable when
        # validation is bypassed - confirm before relying on it.
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get("temperature", 0)

        return BeamSearchParams(
            beam_width=n,
            max_tokens=max_tokens,
            temperature=temperature,
            length_penalty=self.length_penalty,
            include_stop_str_in_output=self.include_stop_str_in_output,
        )

    def to_sampling_params(
        self, default_max_tokens: int, default_sampling_params: dict | None = None
    ) -> SamplingParams:
        """Build `SamplingParams` from this request.

        For `temperature`, `top_p`, `top_k`, `min_p` and `repetition_penalty`
        the value is resolved in this order: the request field if it is not
        `None`, then `default_sampling_params`, then
        `_DEFAULT_SAMPLING_PARAMS`.

        Args:
            default_max_tokens: Used as `max_tokens` as-is; this method does
                not consult `max_completion_tokens`.
            default_sampling_params: Optional per-deployment fallback values.

        Returns:
            Sampling parameters whose output kind is `DELTA` when streaming
            and `FINAL_ONLY` otherwise.
        """
        max_tokens = default_max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}

        def _resolve(name: str):
            # Request value wins; otherwise fall back to the supplied
            # defaults and finally to the class-level defaults.
            value = getattr(self, name)
            if value is None:
                value = default_sampling_params.get(
                    name, self._DEFAULT_SAMPLING_PARAMS[name]
                )
            return value

        return SamplingParams.from_optional(
            temperature=_resolve("temperature"),
            max_tokens=max_tokens,
            seed=self.seed,
            top_p=_resolve("top_p"),
            top_k=_resolve("top_k"),
            min_p=_resolve("min_p"),
            frequency_penalty=self.frequency_penalty,
            repetition_penalty=_resolve("repetition_penalty"),
            presence_penalty=self.presence_penalty,
            output_kind=RequestOutputKind.DELTA
            if self.stream
            else RequestOutputKind.FINAL_ONLY,
            extra_args=self.vllm_xargs,
            skip_clone=True,  # Created fresh per request, safe to skip clone
        )

    @model_validator(mode="before")
    @classmethod
    def validate_transcription_request(cls, data):
        """Check and normalize the raw request before field validation.

        * Rejects a `file` given as a plain string.
        * Rejects stream options that are set while `stream` is not.
        * Decodes `vllm_xargs` when it arrives as a JSON string (form data).

        Raises:
            HTTPException: If `file` is a `str`.
            VLLMValidationError: If a stream option is set without `stream`,
                or if `vllm_xargs` is a string that is not valid JSON.
        """
        from collections.abc import Mapping

        # A "before" validator receives the raw input, which is not guaranteed
        # to be a mapping. Leave anything else for pydantic to reject instead
        # of failing with AttributeError on `.get`.
        if not isinstance(data, Mapping):
            return data

        if isinstance(data.get("file"), str):
            raise HTTPException(
                status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
                detail="Expected 'file' to be a file-like object, not 'str'.",
            )

        # Form data delivers booleans as strings, and `bool("false")` is True.
        # Treat the strings pydantic parses as False as unset.
        false_strings = {"", "0", "off", "f", "false", "n", "no"}

        def _flag_is_set(value) -> bool:
            if isinstance(value, str):
                return value.lower() not in false_strings
            return bool(value)

        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
        enabled_opts = [so for so in stream_opts if _flag_is_set(data.get(so, False))]
        if enabled_opts and not _flag_is_set(data.get("stream", False)):
            raise VLLMValidationError(
                "Stream options can only be defined when `stream=True`.",
                # Report the first stream option that was actually set.
                parameter=enabled_opts[0],
            )

        # Parse vllm_xargs from JSON string (form data sends it as a string)
        xargs = data.get("vllm_xargs")
        if isinstance(xargs, str):
            try:
                parsed_xargs = json.loads(xargs)
            except json.JSONDecodeError as e:
                raise VLLMValidationError(
                    f"Failed to parse vllm_xargs. Must be valid JSON: {e}",
                    parameter="vllm_xargs",
                ) from e
            # Build a new dict rather than assigning into `data`: this keeps
            # the caller's object untouched and works for immutable mappings.
            data = {**data, "vllm_xargs": parsed_xargs}

        return data

file instance-attribute

file: UploadFile

The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.

frequency_penalty class-attribute instance-attribute

frequency_penalty: float | None = 0.0

The frequency penalty to use for sampling.

hotwords class-attribute instance-attribute

hotwords: str | None = None

hotwords refers to a list of important words or phrases that the model should pay extra attention to during transcription.

include_stop_str_in_output class-attribute instance-attribute

include_stop_str_in_output: bool = False

Whether to include the stop strings in output text.

language class-attribute instance-attribute

language: str | None = None

The language of the input audio.

Supplying the input language in ISO-639-1 format will improve accuracy and latency.

length_penalty class-attribute instance-attribute

length_penalty: float = 1.0

Length penalty to be used for beam search.

max_completion_tokens class-attribute instance-attribute

max_completion_tokens: int | None = None

The maximum number of tokens to generate.

min_p class-attribute instance-attribute

min_p: float | None = None

Filters out tokens with a probability lower than min_p, ensuring a minimum likelihood threshold during sampling.

model class-attribute instance-attribute

model: str | None = None

ID of the model to use.

n class-attribute instance-attribute

n: int = 1

The number of beams to be used in beam search.

presence_penalty class-attribute instance-attribute

presence_penalty: float | None = 0.0

The presence penalty to use for sampling.

prompt class-attribute instance-attribute

prompt: str = Field(default='')

An optional text to guide the model's style or continue a previous audio segment.

The prompt should match the audio language.

repetition_penalty class-attribute instance-attribute

repetition_penalty: float | None = None

The repetition penalty to use for sampling.

response_format class-attribute instance-attribute

response_format: AudioResponseFormat = Field(default="json")

The format of the output, in one of these options: json, text, srt, verbose_json, or vtt.

seed class-attribute instance-attribute

seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)

The seed to use for sampling.

stream class-attribute instance-attribute

stream: bool | None = False

When set, it will enable output to be streamed in a similar fashion as the Chat Completion endpoint.

temperature class-attribute instance-attribute

temperature: float = Field(default=0.0)

The sampling temperature, between 0 and 1.

Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused / deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.

timestamp_granularities class-attribute instance-attribute

timestamp_granularities: list[
    Literal["word", "segment"]
] = Field(alias="timestamp_granularities[]", default=[])

The timestamp granularities to populate for this transcription.

response_format must be set to verbose_json to use timestamp granularities. Either or both of these options are supported: word, or segment. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.

to_language class-attribute instance-attribute

to_language: str | None = None

The language of the output text we transcribe to.

Please note that this is not currently used by supported models, but it is a placeholder for future use, matching translation api.

top_k class-attribute instance-attribute

top_k: int | None = None

Limits sampling to the k most probable tokens at each step.

top_p class-attribute instance-attribute

top_p: float | None = None

Enables nucleus (top-p) sampling, where tokens are selected from the smallest possible set whose cumulative probability exceeds p.

use_beam_search class-attribute instance-attribute

use_beam_search: bool = False

Whether or not beam search should be used.

TranscriptionResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/speech_to_text/transcription/protocol.py
class TranscriptionResponse(OpenAIBaseModel):
    # Model holding the transcribed text plus a usage record.
    text: str
    """The transcribed text."""
    # Usage record for the request. Its schema is defined by
    # TranscriptionUsageAudio, which is declared outside this section.
    usage: TranscriptionUsageAudio

text instance-attribute

text: str

The transcribed text.

TranscriptionResponseVerbose

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/speech_to_text/transcription/protocol.py
class TranscriptionResponseVerbose(OpenAIBaseModel):
    # Model holding the transcript together with audio metadata (duration,
    # language) and optional per-segment / per-word details.
    # NOTE(review): presumably the payload for `response_format=verbose_json`
    # - confirm against the serving code.

    # Declared as a string; the unit is not stated here.
    # NOTE(review): presumably seconds, like segment start/end - confirm.
    duration: str
    """The duration of the input audio."""

    language: str
    """The language of the input audio."""

    text: str
    """The transcribed text."""

    # Optional: defaults to None when no segment details are provided.
    segments: list[TranscriptionSegment] | None = None
    """Segments of the transcribed text and their corresponding details."""

    # Optional: defaults to None when no word timestamps are provided.
    words: list[TranscriptionWord] | None = None
    """Extracted words and their corresponding timestamps."""

duration instance-attribute

duration: str

The duration of the input audio.

language instance-attribute

language: str

The language of the input audio.

segments class-attribute instance-attribute

segments: list[TranscriptionSegment] | None = None

Segments of the transcribed text and their corresponding details.

text instance-attribute

text: str

The transcribed text.

words class-attribute instance-attribute

words: list[TranscriptionWord] | None = None

Extracted words and their corresponding timestamps.

TranscriptionSegment

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/speech_to_text/transcription/protocol.py
class TranscriptionSegment(OpenAIBaseModel):
    # One segment of a transcript: its timing (`start`/`end`, in seconds),
    # text, token IDs and per-segment quality indicators.

    id: int
    """Unique identifier of the segment."""

    avg_logprob: float
    """Average logprob of the segment.

    If the value is lower than -1, consider the logprobs failed.
    """

    compression_ratio: float
    """Compression ratio of the segment.

    If the value is greater than 2.4, consider the compression failed.
    """

    end: float
    """End time of the segment in seconds."""

    # The only optional field of the segment; defaults to None.
    # NOTE(review): the docstring threshold "higher than 1.0" cannot be met
    # by a probability - confirm the intended threshold.
    no_speech_prob: float | None = None
    """Probability of no speech in the segment.

    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
    this segment silent.
    """

    seek: int
    """Seek offset of the segment."""

    start: float
    """Start time of the segment in seconds."""

    temperature: float
    """Temperature parameter used for generating the segment."""

    text: str
    """Text content of the segment."""

    tokens: list[int]
    """Array of token IDs for the text content."""

avg_logprob instance-attribute

avg_logprob: float

Average logprob of the segment.

If the value is lower than -1, consider the logprobs failed.

compression_ratio instance-attribute

compression_ratio: float

Compression ratio of the segment.

If the value is greater than 2.4, consider the compression failed.

end instance-attribute

end: float

End time of the segment in seconds.

id instance-attribute

id: int

Unique identifier of the segment.

no_speech_prob class-attribute instance-attribute

no_speech_prob: float | None = None

Probability of no speech in the segment.

If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent.

seek instance-attribute

seek: int

Seek offset of the segment.

start instance-attribute

start: float

Start time of the segment in seconds.

temperature instance-attribute

temperature: float

Temperature parameter used for generating the segment.

text instance-attribute

text: str

Text content of the segment.

tokens instance-attribute

tokens: list[int]

Array of token IDs for the text content.

TranscriptionWord

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/speech_to_text/transcription/protocol.py
class TranscriptionWord(OpenAIBaseModel):
    # A single transcribed word with its start and end time in seconds.
    # All three fields are required.

    end: float
    """End time of the word in seconds."""

    start: float
    """Start time of the word in seconds."""

    word: str
    """The text content of the word."""

end instance-attribute

end: float

End time of the word in seconds.

start instance-attribute

start: float

Start time of the word in seconds.

word instance-attribute

word: str

The text content of the word.