vllm.entrypoints.openai.protocol

AnyResponseFormat module-attribute

AudioResponseFormat module-attribute

AudioResponseFormat: TypeAlias = Literal[
    "json", "text", "srt", "verbose_json", "vtt"
]

BatchRequestInputBody module-attribute

EmbeddingRequest module-attribute

LogitsProcessors module-attribute

LogitsProcessors = list[
    Union[str, LogitsProcessorConstructor]
]

PoolingChatRequest module-attribute

PoolingChatRequest = EmbeddingChatRequest

PoolingCompletionRequest module-attribute

PoolingCompletionRequest = EmbeddingCompletionRequest

PoolingRequest module-attribute

TokenizeRequest module-attribute

_LONG_INFO module-attribute

_LONG_INFO = iinfo(long)

logger module-attribute

logger = init_logger(__name__)

BatchRequestInput

Bases: OpenAIBaseModel

The per-line object of the batch input file.

NOTE: Currently only the /v1/chat/completions endpoint is supported.

Source code in vllm/entrypoints/openai/protocol.py
class BatchRequestInput(OpenAIBaseModel):
    """
    The per-line object of the batch input file.

    NOTE: Currently only the `/v1/chat/completions` endpoint is supported.
    """

    # A developer-provided per-request id that will be used to match outputs to
    # inputs. Must be unique for each request in a batch.
    custom_id: str

    # The HTTP method to be used for the request. Currently only POST is
    # supported.
    method: str

    # The OpenAI API relative URL to be used for the request. Currently
    # /v1/chat/completions is supported.
    url: str

    # The parameters of the request.
    body: BatchRequestInputBody

    @field_validator('body', mode='plain')
    @classmethod
    def check_type_for_url(cls, value: Any, info: ValidationInfo):
        # Use url to disambiguate models
        url: str = info.data["url"]
        if url == "/v1/chat/completions":
            return ChatCompletionRequest.model_validate(value)
        if url == "/v1/embeddings":
            return TypeAdapter(EmbeddingRequest).validate_python(value)
        if url.endswith("/score"):
            return ScoreRequest.model_validate(value)
        if url.endswith("/rerank"):
            return RerankRequest.model_validate(value)
        return TypeAdapter(BatchRequestInputBody).validate_python(value)

body instance-attribute

body: BatchRequestInputBody

custom_id instance-attribute

custom_id: str

method instance-attribute

method: str

url instance-attribute

url: str

check_type_for_url classmethod

check_type_for_url(value: Any, info: ValidationInfo)
Source code in vllm/entrypoints/openai/protocol.py
@field_validator('body', mode='plain')
@classmethod
def check_type_for_url(cls, value: Any, info: ValidationInfo):
    # Use url to disambiguate models
    url: str = info.data["url"]
    if url == "/v1/chat/completions":
        return ChatCompletionRequest.model_validate(value)
    if url == "/v1/embeddings":
        return TypeAdapter(EmbeddingRequest).validate_python(value)
    if url.endswith("/score"):
        return ScoreRequest.model_validate(value)
    if url.endswith("/rerank"):
        return RerankRequest.model_validate(value)
    return TypeAdapter(BatchRequestInputBody).validate_python(value)
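
A minimal sketch of parsing one batch input line, assuming a placeholder custom id and model name; the `url` field steers `check_type_for_url` to validate `body` as a `ChatCompletionRequest`:

from vllm.entrypoints.openai.protocol import (BatchRequestInput,
                                               ChatCompletionRequest)

# Hypothetical batch line; custom_id and model are placeholders.
line = {
    "custom_id": "request-1",
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "my-model",
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_completion_tokens": 32,
    },
}
request = BatchRequestInput.model_validate(line)
assert isinstance(request.body, ChatCompletionRequest)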

BatchRequestOutput

Bases: OpenAIBaseModel

The per-line object of the batch output and error files

Source code in vllm/entrypoints/openai/protocol.py
class BatchRequestOutput(OpenAIBaseModel):
    """
    The per-line object of the batch output and error files
    """

    id: str

    # A developer-provided per-request id that will be used to match outputs to
    # inputs.
    custom_id: str

    response: Optional[BatchResponseData]

    # For requests that failed with a non-HTTP error, this will contain more
    # information on the cause of the failure.
    error: Optional[Any]

custom_id instance-attribute

custom_id: str

error instance-attribute

error: Optional[Any]

id instance-attribute

id: str

response instance-attribute

response: Optional[BatchResponseData]

BatchResponseData

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class BatchResponseData(OpenAIBaseModel):
    # HTTP status code of the response.
    status_code: int = 200

    # An unique identifier for the API request.
    request_id: str

    # The body of the response.
    body: Optional[Union[ChatCompletionResponse, EmbeddingResponse,
                         ScoreResponse, RerankResponse]] = None

body class-attribute instance-attribute

body: Optional[
    Union[
        ChatCompletionResponse,
        EmbeddingResponse,
        ScoreResponse,
        RerankResponse,
    ]
] = None

request_id instance-attribute

request_id: str

status_code class-attribute instance-attribute

status_code: int = 200
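
A minimal sketch of one output line built by hand, with placeholder ids; `response` carries the HTTP status, while `error` is reserved for non-HTTP failures:

from vllm.entrypoints.openai.protocol import (BatchRequestOutput,
                                              BatchResponseData)

# Hypothetical output line for a request rejected with HTTP 400;
# the ids are placeholders.
out = BatchRequestOutput(
    id="vllm-batch-0001",
    custom_id="request-1",
    response=BatchResponseData(status_code=400, request_id="req-0001"),
    error=None,
)
print(out.model_dump_json())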

ChatCompletionLogProb

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionLogProb(OpenAIBaseModel):
    token: str
    logprob: float = -9999.0
    bytes: Optional[list[int]] = None

bytes class-attribute instance-attribute

bytes: Optional[list[int]] = None

logprob class-attribute instance-attribute

logprob: float = -9999.0

token instance-attribute

token: str

ChatCompletionLogProbs

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionLogProbs(OpenAIBaseModel):
    content: Optional[list[ChatCompletionLogProbsContent]] = None

content class-attribute instance-attribute

content: Optional[
    list[ChatCompletionLogProbsContent]
] = None

ChatCompletionLogProbsContent

Bases: ChatCompletionLogProb

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionLogProbsContent(ChatCompletionLogProb):
    # Workaround: redefine fields name cache so that it's not
    # shared with the super class.
    field_names: ClassVar[Optional[set[str]]] = None
    top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)

field_names class-attribute

field_names: Optional[set[str]] = None

top_logprobs class-attribute instance-attribute

top_logprobs: list[ChatCompletionLogProb] = Field(
    default_factory=list
)

ChatCompletionNamedFunction

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionNamedFunction(OpenAIBaseModel):
    name: str

name instance-attribute

name: str

ChatCompletionNamedToolChoiceParam

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
    function: ChatCompletionNamedFunction
    type: Literal["function"] = "function"

function instance-attribute

function: ChatCompletionNamedFunction

type class-attribute instance-attribute

type: Literal['function'] = 'function'
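
A minimal sketch of a named tool choice, using a placeholder tool name:

from vllm.entrypoints.openai.protocol import (
    ChatCompletionNamedFunction, ChatCompletionNamedToolChoiceParam)

# Equivalent to the JSON payload
#   {"type": "function", "function": {"name": "get_weather"}}
# where "get_weather" is a placeholder tool name.
tool_choice = ChatCompletionNamedToolChoiceParam(
    function=ChatCompletionNamedFunction(name="get_weather"))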

ChatCompletionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/chat/create
    messages: list[ChatCompletionMessageParam]
    model: Optional[str] = None
    frequency_penalty: Optional[float] = 0.0
    logit_bias: Optional[dict[str, float]] = None
    logprobs: Optional[bool] = False
    top_logprobs: Optional[int] = 0
    max_tokens: Optional[int] = Field(
        default=None,
        deprecated=
        'max_tokens is deprecated in favor of the max_completion_tokens field')
    max_completion_tokens: Optional[int] = None
    n: Optional[int] = 1
    presence_penalty: Optional[float] = 0.0
    response_format: Optional[AnyResponseFormat] = None
    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    stop: Optional[Union[str, list[str]]] = []
    stream: Optional[bool] = False
    stream_options: Optional[StreamOptions] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    tools: Optional[list[ChatCompletionToolsParam]] = None
    tool_choice: Optional[Union[
        Literal["none"],
        Literal["auto"],
        Literal["required"],
        ChatCompletionNamedToolChoiceParam,
    ]] = "none"

    # NOTE this will be ignored by vLLM -- the model determines the behavior
    parallel_tool_calls: Optional[bool] = False
    user: Optional[str] = None

    # --8<-- [start:chat-completion-sampling-params]
    best_of: Optional[int] = None
    use_beam_search: bool = False
    top_k: Optional[int] = None
    min_p: Optional[float] = None
    repetition_penalty: Optional[float] = None
    length_penalty: float = 1.0
    stop_token_ids: Optional[list[int]] = []
    include_stop_str_in_output: bool = False
    ignore_eos: bool = False
    min_tokens: int = 0
    skip_special_tokens: bool = True
    spaces_between_special_tokens: bool = True
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
    prompt_logprobs: Optional[int] = None
    allowed_token_ids: Optional[list[int]] = None
    bad_words: list[str] = Field(default_factory=list)
    # --8<-- [end:chat-completion-sampling-params]

    # --8<-- [start:chat-completion-extra-params]
    echo: bool = Field(
        default=False,
        description=(
            "If true, the new message will be prepended with the last message "
            "if they belong to the same role."),
    )
    add_generation_prompt: bool = Field(
        default=True,
        description=
        ("If true, the generation prompt will be added to the chat template. "
         "This is a parameter used by chat template in tokenizer config of the "
         "model."),
    )
    continue_final_message: bool = Field(
        default=False,
        description=
        ("If this is set, the chat will be formatted so that the final "
         "message in the chat is open-ended, without any EOS tokens. The "
         "model will continue this message rather than starting a new one. "
         "This allows you to \"prefill\" part of the model's response for it. "
         "Cannot be used at the same time as `add_generation_prompt`."),
    )
    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."),
    )
    documents: Optional[list[dict[str, str]]] = Field(
        default=None,
        description=
        ("A list of dicts representing documents that will be accessible to "
         "the model if it is performing RAG (retrieval-augmented generation)."
         " If the template does not support RAG, this argument will have no "
         "effect. We recommend that each document should be a dict containing "
         "\"title\" and \"text\" keys."),
    )
    chat_template: Optional[str] = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."),
    )
    chat_template_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."),
    )
    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    guided_json: Optional[Union[str, dict, BaseModel]] = Field(
        default=None,
        description=("If specified, the output will follow the JSON schema."),
    )
    guided_regex: Optional[str] = Field(
        default=None,
        description=(
            "If specified, the output will follow the regex pattern."),
    )
    guided_choice: Optional[list[str]] = Field(
        default=None,
        description=(
            "If specified, the output will be exactly one of the choices."),
    )
    guided_grammar: Optional[str] = Field(
        default=None,
        description=(
            "If specified, the output will follow the context free grammar."),
    )
    structural_tag: Optional[str] = Field(
        default=None,
        description=(
            "If specified, the output will follow the structural tag schema."),
    )
    guided_decoding_backend: Optional[str] = Field(
        default=None,
        description=(
            "If specified, will override the default guided decoding backend "
            "of the server for this specific request. If set, must be either "
            "'outlines' / 'lm-format-enforcer'"),
    )
    guided_whitespace_pattern: Optional[str] = Field(
        default=None,
        description=(
            "If specified, will override the default whitespace pattern "
            "for guided json decoding."),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."),
    )
    logits_processors: Optional[LogitsProcessors] = Field(
        default=None,
        description=(
            "A list of either qualified names of logits processors, or "
            "constructor objects, to apply when sampling. A constructor is "
            "a JSON object with a required 'qualname' field specifying the "
            "qualified name of the processor class/factory, and optional "
            "'args' and 'kwargs' fields containing positional and keyword "
            "arguments. For example: {'qualname': "
            "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
            "{'param': 'value'}}."))
    return_tokens_as_token_ids: Optional[bool] = Field(
        default=None,
        description=(
            "If specified with 'logprobs', tokens are represented "
            " as strings of the form 'token_id:{token_id}' so that tokens "
            "that are not JSON-encodable can be identified."))
    cache_salt: Optional[str] = Field(
        default=None,
        description=(
            "If specified, the prefix cache will be salted with the provided "
            "string to prevent an attacker to guess prompts in multi-user "
            "environments. The salt should be random, protected from "
            "access by 3rd parties, and long enough to be "
            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
            "to 256 bit). Not supported by vLLM engine V0."))
    kv_transfer_params: Optional[dict[str, Any]] = Field(
        default=None,
        description="KVTransfer parameters used for disaggregated serving.")

    vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
        default=None,
        description=("Additional request parameters with string or "
                     "numeric values, used by custom extensions."),
    )

    # --8<-- [end:chat-completion-extra-params]

    # Default sampling parameters for chat completion requests
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "repetition_penalty": 1.0,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 0,
        "min_p": 0.0,
    }

    def to_beam_search_params(
            self, max_tokens: int,
            default_sampling_params: dict) -> BeamSearchParams:

        n = self.n if self.n is not None else 1
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])

        return BeamSearchParams(
            beam_width=n,
            max_tokens=max_tokens,
            ignore_eos=self.ignore_eos,
            temperature=temperature,
            length_penalty=self.length_penalty,
            include_stop_str_in_output=self.include_stop_str_in_output,
        )

    def to_sampling_params(
        self,
        max_tokens: int,
        logits_processor_pattern: Optional[str],
        default_sampling_params: dict,
    ) -> SamplingParams:

        # Default parameters
        if (repetition_penalty := self.repetition_penalty) is None:
            repetition_penalty = default_sampling_params.get(
                "repetition_penalty",
                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
            )
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
        if (top_p := self.top_p) is None:
            top_p = default_sampling_params.get(
                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
        if (top_k := self.top_k) is None:
            top_k = default_sampling_params.get(
                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
        if (min_p := self.min_p) is None:
            min_p = default_sampling_params.get(
                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])

        prompt_logprobs = self.prompt_logprobs
        if prompt_logprobs is None and self.echo:
            prompt_logprobs = self.top_logprobs

        guided_json_object = None
        if self.response_format is not None:
            if self.response_format.type == "json_object":
                guided_json_object = True
            elif self.response_format.type == "json_schema":
                json_schema = self.response_format.json_schema
                assert json_schema is not None
                self.guided_json = json_schema.json_schema
            elif self.response_format.type == "structural_tag":
                structural_tag = self.response_format
                assert structural_tag is not None and isinstance(
                    structural_tag, StructuralTagResponseFormat)
                s_tag_obj = structural_tag.model_dump(by_alias=True)
                self.structural_tag = json.dumps(s_tag_obj)

        guided_decoding = GuidedDecodingParams.from_optional(
            json=self._get_guided_json_from_tool() or self.guided_json,
            regex=self.guided_regex,
            choice=self.guided_choice,
            grammar=self.guided_grammar,
            json_object=guided_json_object,
            backend=self.guided_decoding_backend,
            whitespace_pattern=self.guided_whitespace_pattern,
            structural_tag=self.structural_tag,
        )

        extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
        if self.kv_transfer_params:
            # Pass in kv_transfer_params via extra_args
            extra_args["kv_transfer_params"] = self.kv_transfer_params
        return SamplingParams.from_optional(
            n=self.n,
            best_of=self.best_of,
            presence_penalty=self.presence_penalty,
            frequency_penalty=self.frequency_penalty,
            repetition_penalty=repetition_penalty,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            min_p=min_p,
            seed=self.seed,
            stop=self.stop,
            stop_token_ids=self.stop_token_ids,
            logprobs=self.top_logprobs if self.logprobs else None,
            prompt_logprobs=prompt_logprobs,
            ignore_eos=self.ignore_eos,
            max_tokens=max_tokens,
            min_tokens=self.min_tokens,
            skip_special_tokens=self.skip_special_tokens,
            spaces_between_special_tokens=self.spaces_between_special_tokens,
            logits_processors=get_logits_processors(self.logits_processors,
                                                    logits_processor_pattern),
            include_stop_str_in_output=self.include_stop_str_in_output,
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            output_kind=RequestOutputKind.DELTA if self.stream \
                else RequestOutputKind.FINAL_ONLY,
            guided_decoding=guided_decoding,
            logit_bias=self.logit_bias,
            bad_words= self.bad_words,
            allowed_token_ids=self.allowed_token_ids,
            extra_args=extra_args or None,
        )

    def _get_guided_json_from_tool(
            self) -> Optional[Union[str, dict, BaseModel]]:
        # user has chosen to not use any tool
        if self.tool_choice == "none" or self.tools is None:
            return None

        # user has chosen to use a named tool
        if type(self.tool_choice) is ChatCompletionNamedToolChoiceParam:
            tool_name = self.tool_choice.function.name
            tools = {tool.function.name: tool.function for tool in self.tools}
            if tool_name not in tools:
                raise ValueError(
                    f"Tool '{tool_name}' has not been passed in `tools`.")
            tool = tools[tool_name]
            return tool.parameters

        if self.tool_choice == "required":
            # Pydantic schema generation cannot be used since the JSON schema
            # has to be constructed for a specific instantiation of a tool list
            # so that parameters of a function are correctly generated
            # based on the chosen function name
            def get_tool_schema(tool: ChatCompletionToolsParam) -> dict:
                return {
                    "properties": {
                        "name": {
                            "type": "string",
                            "enum": [tool.function.name]
                        },
                        # parameters are always generated as '{}' in the final
                        # output if they are missing from the request
                        # (i.e. are None or '{}') so the schema is
                        # updated to produce an empty object in that case
                        "parameters": tool.function.parameters
                        if tool.function.parameters else {
                            "type": "object",
                            "properties": {}
                        }
                    },
                    "required": ["name", "parameters"]
                }

            json_schema = {
                "type": "array",
                "minItems": 1,
                "items": {
                    "type": "object",
                    "anyOf": [get_tool_schema(tool) for tool in self.tools]
                }
            }
            return json_schema

        return None

    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):
        if data.get("stream_options") and not data.get("stream"):
            raise ValueError(
                "Stream options can only be defined when `stream=True`.")

        return data

    @model_validator(mode="before")
    @classmethod
    def check_logprobs(cls, data):
        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
            if data.get("stream") and prompt_logprobs > 0:
                raise ValueError(
                    "`prompt_logprobs` are not available when `stream=True`.")

            if prompt_logprobs < 0:
                raise ValueError("`prompt_logprobs` must be a positive value.")

        if (top_logprobs := data.get("top_logprobs")) is not None:
            if top_logprobs < 0:
                raise ValueError("`top_logprobs` must be a positive value.")

            if top_logprobs > 0 and not data.get("logprobs"):
                raise ValueError(
                    "when using `top_logprobs`, `logprobs` must be set to true."
                )

        return data

    @model_validator(mode="before")
    @classmethod
    def check_guided_decoding_count(cls, data):
        if isinstance(data, ValueError):
            raise data

        guide_count = sum([
            "guided_json" in data and data["guided_json"] is not None,
            "guided_regex" in data and data["guided_regex"] is not None,
            "guided_choice" in data and data["guided_choice"] is not None
        ])
        # you can only use one kind of guided decoding
        if guide_count > 1:
            raise ValueError(
                "You can only use one kind of guided decoding "
                "('guided_json', 'guided_regex' or 'guided_choice').")
        # you can only either use guided decoding or tools, not both
        if guide_count > 1 and data.get("tool_choice", "none") not in (
                "none",
                "auto",
                "required",
        ):
            raise ValueError(
                "You can only either use guided decoding or tools, not both.")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_tool_usage(cls, data):

        # if "tool_choice" is not specified but tools are provided,
        # default to "auto" tool_choice
        if "tool_choice" not in data and data.get("tools"):
            data["tool_choice"] = "auto"

        # if "tool_choice" is "none" -- no validation is needed for tools
        if "tool_choice" in data and data["tool_choice"] == "none":
            return data

        # if "tool_choice" is specified -- validation
        if "tool_choice" in data:

            # ensure that if "tool choice" is specified, tools are present
            if "tools" not in data or data["tools"] is None:
                raise ValueError(
                    "When using `tool_choice`, `tools` must be set.")

            # make sure that tool choice is either a named tool
            # OR that it's set to "auto" or "required"
            if data["tool_choice"] not in [
                    "auto", "required"
            ] and not isinstance(data["tool_choice"], dict):
                raise NotImplementedError(
                    f'Invalid value for `tool_choice`: {data["tool_choice"]}! '\
                    'Only named tools, "none", "auto" or "required" '\
                    'are supported.'
                )

            # ensure that if "tool_choice" is specified as an object,
            # it matches a valid tool
            correct_usage_message = 'Correct usage: `{"type": "function",' \
                ' "function": {"name": "my_function"}}`'
            if isinstance(data["tool_choice"], dict):
                valid_tool = False
                function = data["tool_choice"].get("function")
                if not isinstance(function, dict):
                    raise ValueError(
                        f"Invalid value for `function`: `{function}` in "
                        f"`tool_choice`! {correct_usage_message}")
                if "name" not in function:
                    raise ValueError(f"Expected field `name` in `function` in "
                                     f"`tool_choice`! {correct_usage_message}")
                function_name = function["name"]
                if not isinstance(function_name,
                                  str) or len(function_name) == 0:
                    raise ValueError(
                        f"Invalid `name` in `function`: `{function_name}`"
                        f" in `tool_choice`! {correct_usage_message}")
                for tool in data["tools"]:
                    if tool["function"]["name"] == function_name:
                        valid_tool = True
                        break
                if not valid_tool:
                    raise ValueError(
                        "The tool specified in `tool_choice` does not match any"
                        " of the specified `tools`")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_generation_prompt(cls, data):
        if data.get("continue_final_message") and data.get(
                "add_generation_prompt"):
            raise ValueError("Cannot set both `continue_final_message` and "
                             "`add_generation_prompt` to True.")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_cache_salt_support(cls, data):
        if data.get("cache_salt") is not None:
            if not envs.VLLM_USE_V1:
                raise ValueError(
                    "Parameter 'cache_salt' is not supported with "
                    "this instance of vLLM, which uses engine V0.")
            if not isinstance(data["cache_salt"],
                              str) or not data["cache_salt"]:
                raise ValueError("Parameter 'cache_salt' must be a "
                                 "non-empty string if provided.")
        return data

_DEFAULT_SAMPLING_PARAMS class-attribute instance-attribute

_DEFAULT_SAMPLING_PARAMS: dict = {
    "repetition_penalty": 1.0,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 0,
    "min_p": 0.0,
}

add_generation_prompt class-attribute instance-attribute

add_generation_prompt: bool = Field(
    default=True,
    description="If true, the generation prompt will be added to the chat template. This is a parameter used by chat template in tokenizer config of the model.",
)

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=False,
    description="If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).",
)

allowed_token_ids class-attribute instance-attribute

allowed_token_ids: Optional[list[int]] = None

bad_words class-attribute instance-attribute

bad_words: list[str] = Field(default_factory=list)

best_of class-attribute instance-attribute

best_of: Optional[int] = None

cache_salt class-attribute instance-attribute

cache_salt: Optional[str] = Field(
    default=None,
    description="If specified, the prefix cache will be salted with the provided string to prevent an attacker to guess prompts in multi-user environments. The salt should be random, protected from access by 3rd parties, and long enough to be unpredictable (e.g., 43 characters base64-encoded, corresponding to 256 bit). Not supported by vLLM engine V0.",
)

chat_template class-attribute instance-attribute

chat_template: Optional[str] = Field(
    default=None,
    description="A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one.",
)

chat_template_kwargs class-attribute instance-attribute

chat_template_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional keyword args to pass to the template renderer. Will be accessible by the chat template.",
)

continue_final_message class-attribute instance-attribute

continue_final_message: bool = Field(
    default=False,
    description='If this is set, the chat will be formatted so that the final message in the chat is open-ended, without any EOS tokens. The model will continue this message rather than starting a new one. This allows you to "prefill" part of the model\'s response for it. Cannot be used at the same time as `add_generation_prompt`.',
)

documents class-attribute instance-attribute

documents: Optional[list[dict[str, str]]] = Field(
    default=None,
    description='A list of dicts representing documents that will be accessible to the model if it is performing RAG (retrieval-augmented generation). If the template does not support RAG, this argument will have no effect. We recommend that each document should be a dict containing "title" and "text" keys.',
)

echo class-attribute instance-attribute

echo: bool = Field(
    default=False,
    description="If true, the new message will be prepended with the last message if they belong to the same role.",
)

frequency_penalty class-attribute instance-attribute

frequency_penalty: Optional[float] = 0.0

guided_choice class-attribute instance-attribute

guided_choice: Optional[list[str]] = Field(
    default=None,
    description="If specified, the output will be exactly one of the choices.",
)

guided_decoding_backend class-attribute instance-attribute

guided_decoding_backend: Optional[str] = Field(
    default=None,
    description="If specified, will override the default guided decoding backend of the server for this specific request. If set, must be either 'outlines' / 'lm-format-enforcer'",
)

guided_grammar class-attribute instance-attribute

guided_grammar: Optional[str] = Field(
    default=None,
    description="If specified, the output will follow the context free grammar.",
)

guided_json class-attribute instance-attribute

guided_json: Optional[Union[str, dict, BaseModel]] = Field(
    default=None,
    description="If specified, the output will follow the JSON schema.",
)

guided_regex class-attribute instance-attribute

guided_regex: Optional[str] = Field(
    default=None,
    description="If specified, the output will follow the regex pattern.",
)

guided_whitespace_pattern class-attribute instance-attribute

guided_whitespace_pattern: Optional[str] = Field(
    default=None,
    description="If specified, will override the default whitespace pattern for guided json decoding.",
)
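
The `guided_*` fields are mutually exclusive per request (see `check_guided_decoding_count` below). A minimal request fragment, with placeholder labels:

# Hypothetical request fragment: constrain the output to one of three
# placeholder labels; only one guided_* field may be set at a time.
guided_fragment = {
    "guided_choice": ["positive", "negative", "neutral"],
}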

ignore_eos class-attribute instance-attribute

ignore_eos: bool = False

include_stop_str_in_output class-attribute instance-attribute

include_stop_str_in_output: bool = False

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: Optional[dict[str, Any]] = Field(
    default=None,
    description="KVTransfer parameters used for disaggregated serving.",
)

length_penalty class-attribute instance-attribute

length_penalty: float = 1.0

logit_bias class-attribute instance-attribute

logit_bias: Optional[dict[str, float]] = None

logits_processors class-attribute instance-attribute

logits_processors: Optional[LogitsProcessors] = Field(
    default=None,
    description="A list of either qualified names of logits processors, or constructor objects, to apply when sampling. A constructor is a JSON object with a required 'qualname' field specifying the qualified name of the processor class/factory, and optional 'args' and 'kwargs' fields containing positional and keyword arguments. For example: {'qualname': 'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': {'param': 'value'}}.",
)
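
A sketch of the two accepted forms for `logits_processors`, assuming a hypothetical `my_module.MyLogitsProcessor`:

# Hypothetical value: one processor given by qualified name, and one
# given as a constructor object with 'qualname', 'args' and 'kwargs'.
logits_processors_fragment = {
    "logits_processors": [
        "my_module.MyLogitsProcessor",
        {
            "qualname": "my_module.MyLogitsProcessor",
            "args": [1, 2],
            "kwargs": {"param": "value"},
        },
    ]
}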

logprobs class-attribute instance-attribute

logprobs: Optional[bool] = False

max_completion_tokens class-attribute instance-attribute

max_completion_tokens: Optional[int] = None

max_tokens class-attribute instance-attribute

max_tokens: Optional[int] = Field(
    default=None,
    deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
)

messages instance-attribute

messages: list[ChatCompletionMessageParam]

min_p class-attribute instance-attribute

min_p: Optional[float] = None

min_tokens class-attribute instance-attribute

min_tokens: int = 0

mm_processor_kwargs class-attribute instance-attribute

mm_processor_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional kwargs to pass to the HF processor.",
)

model class-attribute instance-attribute

model: Optional[str] = None

n class-attribute instance-attribute

n: Optional[int] = 1

parallel_tool_calls class-attribute instance-attribute

parallel_tool_calls: Optional[bool] = False

presence_penalty class-attribute instance-attribute

presence_penalty: Optional[float] = 0.0

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

prompt_logprobs class-attribute instance-attribute

prompt_logprobs: Optional[int] = None

repetition_penalty class-attribute instance-attribute

repetition_penalty: Optional[float] = None

request_id class-attribute instance-attribute

request_id: str = Field(
    default_factory=lambda: f"{random_uuid()}",
    description="The request_id related to this request. If the caller does not set it, a random_uuid will be generated. This id is used through out the inference process and return in response.",
)

response_format class-attribute instance-attribute

response_format: Optional[AnyResponseFormat] = None

return_tokens_as_token_ids class-attribute instance-attribute

return_tokens_as_token_ids: Optional[bool] = Field(
    default=None,
    description="If specified with 'logprobs', tokens are represented  as strings of the form 'token_id:{token_id}' so that tokens that are not JSON-encodable can be identified.",
)

seed class-attribute instance-attribute

seed: Optional[int] = Field(
    None, ge=_LONG_INFO.min, le=_LONG_INFO.max
)

skip_special_tokens class-attribute instance-attribute

skip_special_tokens: bool = True

spaces_between_special_tokens class-attribute instance-attribute

spaces_between_special_tokens: bool = True

stop class-attribute instance-attribute

stop: Optional[Union[str, list[str]]] = []

stop_token_ids class-attribute instance-attribute

stop_token_ids: Optional[list[int]] = []

stream class-attribute instance-attribute

stream: Optional[bool] = False

stream_options class-attribute instance-attribute

stream_options: Optional[StreamOptions] = None

structural_tag class-attribute instance-attribute

structural_tag: Optional[str] = Field(
    default=None,
    description="If specified, the output will follow the structural tag schema.",
)

temperature class-attribute instance-attribute

temperature: Optional[float] = None

tool_choice class-attribute instance-attribute

tool_choice: Optional[
    Union[
        Literal["none"],
        Literal["auto"],
        Literal["required"],
        ChatCompletionNamedToolChoiceParam,
    ]
] = "none"

tools class-attribute instance-attribute

tools: Optional[list[ChatCompletionToolsParam]] = None

top_k class-attribute instance-attribute

top_k: Optional[int] = None

top_logprobs class-attribute instance-attribute

top_logprobs: Optional[int] = 0

top_p class-attribute instance-attribute

top_p: Optional[float] = None

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Field(ge=1)]
] = None

use_beam_search class-attribute instance-attribute

use_beam_search: bool = False

user class-attribute instance-attribute

user: Optional[str] = None

vllm_xargs class-attribute instance-attribute

vllm_xargs: Optional[dict[str, Union[str, int, float]]] = (
    Field(
        default=None,
        description="Additional request parameters with string or numeric values, used by custom extensions.",
    )
)

_get_guided_json_from_tool

_get_guided_json_from_tool() -> Optional[
    Union[str, dict, BaseModel]
]
Source code in vllm/entrypoints/openai/protocol.py
def _get_guided_json_from_tool(
        self) -> Optional[Union[str, dict, BaseModel]]:
    # user has chosen to not use any tool
    if self.tool_choice == "none" or self.tools is None:
        return None

    # user has chosen to use a named tool
    if type(self.tool_choice) is ChatCompletionNamedToolChoiceParam:
        tool_name = self.tool_choice.function.name
        tools = {tool.function.name: tool.function for tool in self.tools}
        if tool_name not in tools:
            raise ValueError(
                f"Tool '{tool_name}' has not been passed in `tools`.")
        tool = tools[tool_name]
        return tool.parameters

    if self.tool_choice == "required":
        # Pydantic schema generation cannot be used since the JSON schema
        # has to be constructed for a specific instantiation of a tool list
        # so that parameters of a function are correctly generated
        # based on the chosen function name
        def get_tool_schema(tool: ChatCompletionToolsParam) -> dict:
            return {
                "properties": {
                    "name": {
                        "type": "string",
                        "enum": [tool.function.name]
                    },
                    # parameters are always generated as '{}' in the final
                    # output if they are missing from the request
                    # (i.e. are None or '{}') so the schema is
                    # updated to produce an empty object in that case
                    "parameters": tool.function.parameters
                    if tool.function.parameters else {
                        "type": "object",
                        "properties": {}
                    }
                },
                "required": ["name", "parameters"]
            }

        json_schema = {
            "type": "array",
            "minItems": 1,
            "items": {
                "type": "object",
                "anyOf": [get_tool_schema(tool) for tool in self.tools]
            }
        }
        return json_schema

    return None
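
For `tool_choice="required"`, the schema assembled above constrains the output to a non-empty array of tool calls. For a single hypothetical tool named `get_weather` whose function declares no parameters, the result would be:

# Schema produced for one placeholder tool; since the function declares
# no parameters, "parameters" falls back to an empty object schema.
json_schema = {
    "type": "array",
    "minItems": 1,
    "items": {
        "type": "object",
        "anyOf": [{
            "properties": {
                "name": {"type": "string", "enum": ["get_weather"]},
                "parameters": {"type": "object", "properties": {}},
            },
            "required": ["name", "parameters"],
        }],
    },
}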

check_cache_salt_support classmethod

check_cache_salt_support(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_cache_salt_support(cls, data):
    if data.get("cache_salt") is not None:
        if not envs.VLLM_USE_V1:
            raise ValueError(
                "Parameter 'cache_salt' is not supported with "
                "this instance of vLLM, which uses engine V0.")
        if not isinstance(data["cache_salt"],
                          str) or not data["cache_salt"]:
            raise ValueError("Parameter 'cache_salt' must be a "
                             "non-empty string if provided.")
    return data

check_generation_prompt classmethod

check_generation_prompt(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_generation_prompt(cls, data):
    if data.get("continue_final_message") and data.get(
            "add_generation_prompt"):
        raise ValueError("Cannot set both `continue_final_message` and "
                         "`add_generation_prompt` to True.")
    return data

check_guided_decoding_count classmethod

check_guided_decoding_count(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_guided_decoding_count(cls, data):
    if isinstance(data, ValueError):
        raise data

    guide_count = sum([
        "guided_json" in data and data["guided_json"] is not None,
        "guided_regex" in data and data["guided_regex"] is not None,
        "guided_choice" in data and data["guided_choice"] is not None
    ])
    # you can only use one kind of guided decoding
    if guide_count > 1:
        raise ValueError(
            "You can only use one kind of guided decoding "
            "('guided_json', 'guided_regex' or 'guided_choice').")
    # you can only either use guided decoding or tools, not both
    if guide_count > 1 and data.get("tool_choice", "none") not in (
            "none",
            "auto",
            "required",
    ):
        raise ValueError(
            "You can only either use guided decoding or tools, not both.")
    return data

check_logprobs classmethod

check_logprobs(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_logprobs(cls, data):
    if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
        if data.get("stream") and prompt_logprobs > 0:
            raise ValueError(
                "`prompt_logprobs` are not available when `stream=True`.")

        if prompt_logprobs < 0:
            raise ValueError("`prompt_logprobs` must be a positive value.")

    if (top_logprobs := data.get("top_logprobs")) is not None:
        if top_logprobs < 0:
            raise ValueError("`top_logprobs` must be a positive value.")

        if top_logprobs > 0 and not data.get("logprobs"):
            raise ValueError(
                "when using `top_logprobs`, `logprobs` must be set to true."
            )

    return data

check_tool_usage classmethod

check_tool_usage(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_tool_usage(cls, data):

    # if "tool_choice" is not specified but tools are provided,
    # default to "auto" tool_choice
    if "tool_choice" not in data and data.get("tools"):
        data["tool_choice"] = "auto"

    # if "tool_choice" is "none" -- no validation is needed for tools
    if "tool_choice" in data and data["tool_choice"] == "none":
        return data

    # if "tool_choice" is specified -- validation
    if "tool_choice" in data:

        # ensure that if "tool choice" is specified, tools are present
        if "tools" not in data or data["tools"] is None:
            raise ValueError(
                "When using `tool_choice`, `tools` must be set.")

        # make sure that tool choice is either a named tool
        # OR that it's set to "auto" or "required"
        if data["tool_choice"] not in [
                "auto", "required"
        ] and not isinstance(data["tool_choice"], dict):
            raise NotImplementedError(
                f'Invalid value for `tool_choice`: {data["tool_choice"]}! '\
                'Only named tools, "none", "auto" or "required" '\
                'are supported.'
            )

        # ensure that if "tool_choice" is specified as an object,
        # it matches a valid tool
        correct_usage_message = 'Correct usage: `{"type": "function",' \
            ' "function": {"name": "my_function"}}`'
        if isinstance(data["tool_choice"], dict):
            valid_tool = False
            function = data["tool_choice"].get("function")
            if not isinstance(function, dict):
                raise ValueError(
                    f"Invalid value for `function`: `{function}` in "
                    f"`tool_choice`! {correct_usage_message}")
            if "name" not in function:
                raise ValueError(f"Expected field `name` in `function` in "
                                 f"`tool_choice`! {correct_usage_message}")
            function_name = function["name"]
            if not isinstance(function_name,
                              str) or len(function_name) == 0:
                raise ValueError(
                    f"Invalid `name` in `function`: `{function_name}`"
                    f" in `tool_choice`! {correct_usage_message}")
            for tool in data["tools"]:
                if tool["function"]["name"] == function_name:
                    valid_tool = True
                    break
            if not valid_tool:
                raise ValueError(
                    "The tool specified in `tool_choice` does not match any"
                    " of the specified `tools`")
    return data

to_beam_search_params

to_beam_search_params(
    max_tokens: int, default_sampling_params: dict
) -> BeamSearchParams
Source code in vllm/entrypoints/openai/protocol.py
def to_beam_search_params(
        self, max_tokens: int,
        default_sampling_params: dict) -> BeamSearchParams:

    n = self.n if self.n is not None else 1
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])

    return BeamSearchParams(
        beam_width=n,
        max_tokens=max_tokens,
        ignore_eos=self.ignore_eos,
        temperature=temperature,
        length_penalty=self.length_penalty,
        include_stop_str_in_output=self.include_stop_str_in_output,
    )

to_sampling_params

to_sampling_params(
    max_tokens: int,
    logits_processor_pattern: Optional[str],
    default_sampling_params: dict,
) -> SamplingParams
Source code in vllm/entrypoints/openai/protocol.py
def to_sampling_params(
    self,
    max_tokens: int,
    logits_processor_pattern: Optional[str],
    default_sampling_params: dict,
) -> SamplingParams:

    # Default parameters
    if (repetition_penalty := self.repetition_penalty) is None:
        repetition_penalty = default_sampling_params.get(
            "repetition_penalty",
            self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
        )
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
    if (top_p := self.top_p) is None:
        top_p = default_sampling_params.get(
            "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
    if (top_k := self.top_k) is None:
        top_k = default_sampling_params.get(
            "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
    if (min_p := self.min_p) is None:
        min_p = default_sampling_params.get(
            "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])

    prompt_logprobs = self.prompt_logprobs
    if prompt_logprobs is None and self.echo:
        prompt_logprobs = self.top_logprobs

    guided_json_object = None
    if self.response_format is not None:
        if self.response_format.type == "json_object":
            guided_json_object = True
        elif self.response_format.type == "json_schema":
            json_schema = self.response_format.json_schema
            assert json_schema is not None
            self.guided_json = json_schema.json_schema
        elif self.response_format.type == "structural_tag":
            structural_tag = self.response_format
            assert structural_tag is not None and isinstance(
                structural_tag, StructuralTagResponseFormat)
            s_tag_obj = structural_tag.model_dump(by_alias=True)
            self.structural_tag = json.dumps(s_tag_obj)

    guided_decoding = GuidedDecodingParams.from_optional(
        json=self._get_guided_json_from_tool() or self.guided_json,
        regex=self.guided_regex,
        choice=self.guided_choice,
        grammar=self.guided_grammar,
        json_object=guided_json_object,
        backend=self.guided_decoding_backend,
        whitespace_pattern=self.guided_whitespace_pattern,
        structural_tag=self.structural_tag,
    )

    extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
    if self.kv_transfer_params:
        # Pass in kv_transfer_params via extra_args
        extra_args["kv_transfer_params"] = self.kv_transfer_params
    return SamplingParams.from_optional(
        n=self.n,
        best_of=self.best_of,
        presence_penalty=self.presence_penalty,
        frequency_penalty=self.frequency_penalty,
        repetition_penalty=repetition_penalty,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        seed=self.seed,
        stop=self.stop,
        stop_token_ids=self.stop_token_ids,
        logprobs=self.top_logprobs if self.logprobs else None,
        prompt_logprobs=prompt_logprobs,
        ignore_eos=self.ignore_eos,
        max_tokens=max_tokens,
        min_tokens=self.min_tokens,
        skip_special_tokens=self.skip_special_tokens,
        spaces_between_special_tokens=self.spaces_between_special_tokens,
        logits_processors=get_logits_processors(self.logits_processors,
                                                logits_processor_pattern),
        include_stop_str_in_output=self.include_stop_str_in_output,
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        output_kind=RequestOutputKind.DELTA if self.stream \
            else RequestOutputKind.FINAL_ONLY,
        guided_decoding=guided_decoding,
        logit_bias=self.logit_bias,
        bad_words= self.bad_words,
        allowed_token_ids=self.allowed_token_ids,
        extra_args=extra_args or None,
    )
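
A minimal sketch of converting a request into `SamplingParams`, assuming a placeholder model name and empty server defaults:

from vllm.entrypoints.openai.protocol import ChatCompletionRequest

# Build a request and convert it; passing an empty dict for
# default_sampling_params lets the class-level _DEFAULT_SAMPLING_PARAMS
# fill in anything the request leaves unset.
req = ChatCompletionRequest(
    messages=[{"role": "user", "content": "Hi"}],
    model="my-model",
    temperature=0.7,
)
params = req.to_sampling_params(
    max_tokens=64,
    logits_processor_pattern=None,
    default_sampling_params={},
)
print(params.temperature, params.top_p)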

validate_stream_options classmethod

validate_stream_options(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def validate_stream_options(cls, data):
    if data.get("stream_options") and not data.get("stream"):
        raise ValueError(
            "Stream options can only be defined when `stream=True`.")

    return data

ChatCompletionResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
    object: Literal["chat.completion"] = "chat.completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[ChatCompletionResponseChoice]
    usage: UsageInfo
    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
    kv_transfer_params: Optional[dict[str, Any]] = Field(
        default=None, description="KVTransfer parameters.")

choices instance-attribute

choices: list[ChatCompletionResponseChoice]

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"chatcmpl-{random_uuid()}"
)

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: Optional[dict[str, Any]] = Field(
    default=None, description="KVTransfer parameters."
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal['chat.completion'] = 'chat.completion'

prompt_logprobs class-attribute instance-attribute

prompt_logprobs: Optional[
    list[Optional[dict[int, Logprob]]]
] = None

usage instance-attribute

usage: UsageInfo

ChatCompletionResponseChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionResponseChoice(OpenAIBaseModel):
    index: int
    message: ChatMessage
    logprobs: Optional[ChatCompletionLogProbs] = None
    # per OpenAI spec this is the default
    finish_reason: Optional[str] = "stop"
    # not part of the OpenAI spec but included in vLLM for legacy reasons
    stop_reason: Optional[Union[int, str]] = None

finish_reason class-attribute instance-attribute

finish_reason: Optional[str] = 'stop'

index instance-attribute

index: int

logprobs class-attribute instance-attribute

logprobs: Optional[ChatCompletionLogProbs] = None

message instance-attribute

message: ChatMessage

stop_reason class-attribute instance-attribute

stop_reason: Optional[Union[int, str]] = None

ChatCompletionResponseStreamChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
    index: int
    delta: DeltaMessage
    logprobs: Optional[ChatCompletionLogProbs] = None
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = None

delta instance-attribute

delta: DeltaMessage

finish_reason class-attribute instance-attribute

finish_reason: Optional[str] = None

index instance-attribute

index: int

logprobs class-attribute instance-attribute

logprobs: Optional[ChatCompletionLogProbs] = None

stop_reason class-attribute instance-attribute

stop_reason: Optional[Union[int, str]] = None

ChatCompletionStreamResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[ChatCompletionResponseStreamChoice]
    usage: Optional[UsageInfo] = Field(default=None)

choices instance-attribute

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"chatcmpl-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal["chat.completion.chunk"] = (
    "chat.completion.chunk"
)

usage class-attribute instance-attribute

usage: Optional[UsageInfo] = Field(default=None)
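
A minimal sketch (illustrative only) of parsing one streamed SSE chunk into this model; the chunk contents are hypothetical.

from vllm.entrypoints.openai.protocol import ChatCompletionStreamResponse

# One hypothetical "data:" chunk from a streamed /v1/chat/completions call.
chunk = {
    "id": "chatcmpl-abc123",
    "object": "chat.completion.chunk",
    "created": 1700000000,
    "model": "my-model",
    "choices": [
        {"index": 0, "delta": {"content": "Hel"}, "finish_reason": None},
    ],
}

parsed = ChatCompletionStreamResponse.model_validate(chunk)
print(parsed.choices[0].delta.content)  # -> "Hel"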

ChatCompletionToolsParam

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionToolsParam(OpenAIBaseModel):
    type: Literal["function"] = "function"
    function: FunctionDefinition

function instance-attribute

type class-attribute instance-attribute

type: Literal['function'] = 'function'
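
A minimal sketch (illustrative only) of declaring a tool with FunctionDefinition; the function name and JSON-schema parameters below are hypothetical.

from vllm.entrypoints.openai.protocol import (ChatCompletionToolsParam,
                                              FunctionDefinition)

# Hypothetical tool definition following the OpenAI "function" tool shape.
tool = ChatCompletionToolsParam(function=FunctionDefinition(
    name="get_weather",
    description="Look up the current weather for a city.",
    parameters={
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    },
))
print(tool.type)  # -> "function"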

ChatMessage

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatMessage(OpenAIBaseModel):
    role: str
    reasoning_content: Optional[str] = None
    content: Optional[str] = None
    tool_calls: list[ToolCall] = Field(default_factory=list)

content class-attribute instance-attribute

content: Optional[str] = None

reasoning_content class-attribute instance-attribute

reasoning_content: Optional[str] = None

role instance-attribute

role: str

tool_calls class-attribute instance-attribute

tool_calls: list[ToolCall] = Field(default_factory=list)

ClassificationData

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ClassificationData(OpenAIBaseModel):
    index: int
    label: Optional[str]
    probs: list[float]
    num_classes: int

index instance-attribute

index: int

label instance-attribute

label: Optional[str]

num_classes instance-attribute

num_classes: int

probs instance-attribute

probs: list[float]

ClassificationRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ClassificationRequest(OpenAIBaseModel):
    model: Optional[str] = None
    input: Union[list[str], str]
    truncate_prompt_tokens: Optional[int] = None
    user: Optional[str] = None

    # --8<-- [start:classification-pooling-params]
    additional_data: Optional[Any] = None
    # --8<-- [end:classification-pooling-params]

    # --8<-- [start:classification-extra-params]
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )

    # --8<-- [end:classification-extra-params]

    def to_pooling_params(self):
        return PoolingParams(additional_data=self.additional_data)

additional_data class-attribute instance-attribute

additional_data: Optional[Any] = None

input instance-attribute

input: Union[list[str], str]

model class-attribute instance-attribute

model: Optional[str] = None

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[int] = None

user class-attribute instance-attribute

user: Optional[str] = None

to_pooling_params

to_pooling_params()
Source code in vllm/entrypoints/openai/protocol.py
def to_pooling_params(self):
    return PoolingParams(additional_data=self.additional_data)
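
A minimal sketch (illustrative only): building a classification request and turning it into pooling parameters with the to_pooling_params helper shown above; the model name is hypothetical.

from vllm.entrypoints.openai.protocol import ClassificationRequest

req = ClassificationRequest(
    model="my-classifier",  # hypothetical model name
    input=["great product", "terrible service"],
)
pooling_params = req.to_pooling_params()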

ClassificationResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ClassificationResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"classify-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[ClassificationData]
    usage: UsageInfo

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

data instance-attribute

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"classify-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: str = 'list'

usage instance-attribute

usage: UsageInfo

CompletionLogProbs

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class CompletionLogProbs(OpenAIBaseModel):
    text_offset: list[int] = Field(default_factory=list)
    token_logprobs: list[Optional[float]] = Field(default_factory=list)
    tokens: list[str] = Field(default_factory=list)
    top_logprobs: list[Optional[dict[str,
                                     float]]] = Field(default_factory=list)

text_offset class-attribute instance-attribute

text_offset: list[int] = Field(default_factory=list)

token_logprobs class-attribute instance-attribute

token_logprobs: list[Optional[float]] = Field(
    default_factory=list
)

tokens class-attribute instance-attribute

tokens: list[str] = Field(default_factory=list)

top_logprobs class-attribute instance-attribute

top_logprobs: list[Optional[dict[str, float]]] = Field(
    default_factory=list
)

CompletionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class CompletionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/completions/create
    model: Optional[str] = None
    prompt: Optional[Union[list[int], list[list[int]], str, list[str]]] = None
    prompt_embeds: Optional[Union[bytes, list[bytes]]] = None
    best_of: Optional[int] = None
    echo: Optional[bool] = False
    frequency_penalty: Optional[float] = 0.0
    logit_bias: Optional[dict[str, float]] = None
    logprobs: Optional[int] = None
    max_tokens: Optional[int] = 16
    n: int = 1
    presence_penalty: Optional[float] = 0.0
    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    stop: Optional[Union[str, list[str]]] = []
    stream: Optional[bool] = False
    stream_options: Optional[StreamOptions] = None
    suffix: Optional[str] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    user: Optional[str] = None

    # --8<-- [start:completion-sampling-params]
    use_beam_search: bool = False
    top_k: Optional[int] = None
    min_p: Optional[float] = None
    repetition_penalty: Optional[float] = None
    length_penalty: float = 1.0
    stop_token_ids: Optional[list[int]] = []
    include_stop_str_in_output: bool = False
    ignore_eos: bool = False
    min_tokens: int = 0
    skip_special_tokens: bool = True
    spaces_between_special_tokens: bool = True
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
    allowed_token_ids: Optional[list[int]] = None
    prompt_logprobs: Optional[int] = None
    # --8<-- [end:completion-sampling-params]

    # --8<-- [start:completion-extra-params]
    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."),
    )
    response_format: Optional[AnyResponseFormat] = Field(
        default=None,
        description=(
            "Similar to chat completion, this parameter specifies the format "
            "of output. Only {'type': 'json_object'}, {'type': 'json_schema'}"
            ", {'type': 'structural_tag'}, or {'type': 'text' } is supported."
        ),
    )
    guided_json: Optional[Union[str, dict, BaseModel]] = Field(
        default=None,
        description="If specified, the output will follow the JSON schema.",
    )
    guided_regex: Optional[str] = Field(
        default=None,
        description=(
            "If specified, the output will follow the regex pattern."),
    )
    guided_choice: Optional[list[str]] = Field(
        default=None,
        description=(
            "If specified, the output will be exactly one of the choices."),
    )
    guided_grammar: Optional[str] = Field(
        default=None,
        description=(
            "If specified, the output will follow the context free grammar."),
    )
    guided_decoding_backend: Optional[str] = Field(
        default=None,
        description=(
            "If specified, will override the default guided decoding backend "
            "of the server for this specific request. If set, must be one of "
            "'outlines' / 'lm-format-enforcer'"),
    )
    guided_whitespace_pattern: Optional[str] = Field(
        default=None,
        description=(
            "If specified, will override the default whitespace pattern "
            "for guided json decoding."),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
    logits_processors: Optional[LogitsProcessors] = Field(
        default=None,
        description=(
            "A list of either qualified names of logits processors, or "
            "constructor objects, to apply when sampling. A constructor is "
            "a JSON object with a required 'qualname' field specifying the "
            "qualified name of the processor class/factory, and optional "
            "'args' and 'kwargs' fields containing positional and keyword "
            "arguments. For example: {'qualname': "
            "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
            "{'param': 'value'}}."))

    return_tokens_as_token_ids: Optional[bool] = Field(
        default=None,
        description=(
            "If specified with 'logprobs', tokens are represented "
            " as strings of the form 'token_id:{token_id}' so that tokens "
            "that are not JSON-encodable can be identified."))

    kv_transfer_params: Optional[dict[str, Any]] = Field(
        default=None,
        description="KVTransfer parameters used for disaggregated serving.")

    vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
        default=None,
        description=("Additional request parameters with string or "
                     "numeric values, used by custom extensions."),
    )

    # --8<-- [end:completion-extra-params]

    # Default sampling parameters for completion requests
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "repetition_penalty": 1.0,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 0,
        "min_p": 0.0,
    }

    def to_beam_search_params(
        self,
        max_tokens: int,
        default_sampling_params: Optional[dict] = None,
    ) -> BeamSearchParams:

        if default_sampling_params is None:
            default_sampling_params = {}
        n = self.n if self.n is not None else 1

        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get("temperature", 1.0)

        return BeamSearchParams(
            beam_width=n,
            max_tokens=max_tokens,
            ignore_eos=self.ignore_eos,
            temperature=temperature,
            length_penalty=self.length_penalty,
            include_stop_str_in_output=self.include_stop_str_in_output,
        )

    def to_sampling_params(
        self,
        max_tokens: int,
        logits_processor_pattern: Optional[str],
        default_sampling_params: Optional[dict] = None,
    ) -> SamplingParams:

        if default_sampling_params is None:
            default_sampling_params = {}

        # Default parameters
        if (repetition_penalty := self.repetition_penalty) is None:
            repetition_penalty = default_sampling_params.get(
                "repetition_penalty",
                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
            )
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
        if (top_p := self.top_p) is None:
            top_p = default_sampling_params.get(
                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
        if (top_k := self.top_k) is None:
            top_k = default_sampling_params.get(
                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
        if (min_p := self.min_p) is None:
            min_p = default_sampling_params.get(
                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])

        prompt_logprobs = self.prompt_logprobs
        if prompt_logprobs is None and self.echo:
            prompt_logprobs = self.logprobs

        echo_without_generation = self.echo and self.max_tokens == 0

        guided_json_object = None
        if (self.response_format is not None
                and self.response_format.type == "json_object"):
            guided_json_object = True

        guided_decoding = GuidedDecodingParams.from_optional(
            json=self.guided_json,
            regex=self.guided_regex,
            choice=self.guided_choice,
            grammar=self.guided_grammar,
            json_object=guided_json_object,
            backend=self.guided_decoding_backend,
            whitespace_pattern=self.guided_whitespace_pattern,
        )

        extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
        if self.kv_transfer_params:
            # Pass in kv_transfer_params via extra_args
            extra_args["kv_transfer_params"] = self.kv_transfer_params
        return SamplingParams.from_optional(
            n=self.n,
            best_of=self.best_of,
            presence_penalty=self.presence_penalty,
            frequency_penalty=self.frequency_penalty,
            repetition_penalty=repetition_penalty,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            min_p=min_p,
            seed=self.seed,
            stop=self.stop,
            stop_token_ids=self.stop_token_ids,
            logprobs=self.logprobs,
            ignore_eos=self.ignore_eos,
            max_tokens=max_tokens if not echo_without_generation else 1,
            min_tokens=self.min_tokens,
            prompt_logprobs=prompt_logprobs,
            skip_special_tokens=self.skip_special_tokens,
            spaces_between_special_tokens=self.spaces_between_special_tokens,
            include_stop_str_in_output=self.include_stop_str_in_output,
            logits_processors=get_logits_processors(self.logits_processors,
                                                    logits_processor_pattern),
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            output_kind=RequestOutputKind.DELTA if self.stream \
                else RequestOutputKind.FINAL_ONLY,
            guided_decoding=guided_decoding,
            logit_bias=self.logit_bias,
            allowed_token_ids=self.allowed_token_ids,
            extra_args=extra_args or None,
            )

    @model_validator(mode="before")
    @classmethod
    def check_guided_decoding_count(cls, data):
        guide_count = sum([
            "guided_json" in data and data["guided_json"] is not None,
            "guided_regex" in data and data["guided_regex"] is not None,
            "guided_choice" in data and data["guided_choice"] is not None
        ])
        if guide_count > 1:
            raise ValueError(
                "You can only use one kind of guided decoding "
                "('guided_json', 'guided_regex' or 'guided_choice').")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_logprobs(cls, data):
        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
            if data.get("stream") and prompt_logprobs > 0:
                raise ValueError(
                    "`prompt_logprobs` are not available when `stream=True`.")

            if prompt_logprobs < 0:
                raise ValueError("`prompt_logprobs` must be a positive value.")

        if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
            raise ValueError("`logprobs` must be a positive value.")

        return data

    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):
        if data.get("stream_options") and not data.get("stream"):
            raise ValueError(
                "Stream options can only be defined when `stream=True`.")

        return data

    @model_validator(mode="before")
    @classmethod
    def validate_prompt_and_prompt_embeds(cls, data):
        if data.get("prompt") is None and data.get("prompt_embeds") is None:
            raise ValueError(
                "At least one of `prompt` or `prompt_embeds` must be set.")
        return data

_DEFAULT_SAMPLING_PARAMS class-attribute instance-attribute

_DEFAULT_SAMPLING_PARAMS: dict = {
    "repetition_penalty": 1.0,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 0,
    "min_p": 0.0,
}

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=True,
    description="If true (the default), special tokens (e.g. BOS) will be added to the prompt.",
)

allowed_token_ids class-attribute instance-attribute

allowed_token_ids: Optional[list[int]] = None

best_of class-attribute instance-attribute

best_of: Optional[int] = None

echo class-attribute instance-attribute

echo: Optional[bool] = False

frequency_penalty class-attribute instance-attribute

frequency_penalty: Optional[float] = 0.0

guided_choice class-attribute instance-attribute

guided_choice: Optional[list[str]] = Field(
    default=None,
    description="If specified, the output will be exactly one of the choices.",
)

guided_decoding_backend class-attribute instance-attribute

guided_decoding_backend: Optional[str] = Field(
    default=None,
    description="If specified, will override the default guided decoding backend of the server for this specific request. If set, must be one of 'outlines' / 'lm-format-enforcer'",
)

guided_grammar class-attribute instance-attribute

guided_grammar: Optional[str] = Field(
    default=None,
    description="If specified, the output will follow the context free grammar.",
)

guided_json class-attribute instance-attribute

guided_json: Optional[Union[str, dict, BaseModel]] = Field(
    default=None,
    description="If specified, the output will follow the JSON schema.",
)

guided_regex class-attribute instance-attribute

guided_regex: Optional[str] = Field(
    default=None,
    description="If specified, the output will follow the regex pattern.",
)

guided_whitespace_pattern class-attribute instance-attribute

guided_whitespace_pattern: Optional[str] = Field(
    default=None,
    description="If specified, will override the default whitespace pattern for guided json decoding.",
)

ignore_eos class-attribute instance-attribute

ignore_eos: bool = False

include_stop_str_in_output class-attribute instance-attribute

include_stop_str_in_output: bool = False

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: Optional[dict[str, Any]] = Field(
    default=None,
    description="KVTransfer parameters used for disaggregated serving.",
)

length_penalty class-attribute instance-attribute

length_penalty: float = 1.0

logit_bias class-attribute instance-attribute

logit_bias: Optional[dict[str, float]] = None

logits_processors class-attribute instance-attribute

logits_processors: Optional[LogitsProcessors] = Field(
    default=None,
    description="A list of either qualified names of logits processors, or constructor objects, to apply when sampling. A constructor is a JSON object with a required 'qualname' field specifying the qualified name of the processor class/factory, and optional 'args' and 'kwargs' fields containing positional and keyword arguments. For example: {'qualname': 'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': {'param': 'value'}}.",
)

logprobs class-attribute instance-attribute

logprobs: Optional[int] = None

max_tokens class-attribute instance-attribute

max_tokens: Optional[int] = 16

min_p class-attribute instance-attribute

min_p: Optional[float] = None

min_tokens class-attribute instance-attribute

min_tokens: int = 0

model class-attribute instance-attribute

model: Optional[str] = None

n class-attribute instance-attribute

n: int = 1

presence_penalty class-attribute instance-attribute

presence_penalty: Optional[float] = 0.0

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

prompt class-attribute instance-attribute

prompt: Optional[
    Union[list[int], list[list[int]], str, list[str]]
] = None

prompt_embeds class-attribute instance-attribute

prompt_embeds: Optional[Union[bytes, list[bytes]]] = None

prompt_logprobs class-attribute instance-attribute

prompt_logprobs: Optional[int] = None

repetition_penalty class-attribute instance-attribute

repetition_penalty: Optional[float] = None

response_format class-attribute instance-attribute

response_format: Optional[AnyResponseFormat] = Field(
    default=None,
    description="Similar to chat completion, this parameter specifies the format of output. Only {'type': 'json_object'}, {'type': 'json_schema'}, {'type': 'structural_tag'}, or {'type': 'text' } is supported.",
)

return_tokens_as_token_ids class-attribute instance-attribute

return_tokens_as_token_ids: Optional[bool] = Field(
    default=None,
    description="If specified with 'logprobs', tokens are represented  as strings of the form 'token_id:{token_id}' so that tokens that are not JSON-encodable can be identified.",
)

seed class-attribute instance-attribute

seed: Optional[int] = Field(
    None, ge=_LONG_INFO.min, le=_LONG_INFO.max
)

skip_special_tokens class-attribute instance-attribute

skip_special_tokens: bool = True

spaces_between_special_tokens class-attribute instance-attribute

spaces_between_special_tokens: bool = True

stop class-attribute instance-attribute

stop: Optional[Union[str, list[str]]] = []

stop_token_ids class-attribute instance-attribute

stop_token_ids: Optional[list[int]] = []

stream class-attribute instance-attribute

stream: Optional[bool] = False

stream_options class-attribute instance-attribute

stream_options: Optional[StreamOptions] = None

suffix class-attribute instance-attribute

suffix: Optional[str] = None

temperature class-attribute instance-attribute

temperature: Optional[float] = None

top_k class-attribute instance-attribute

top_k: Optional[int] = None

top_p class-attribute instance-attribute

top_p: Optional[float] = None

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Field(ge=1)]
] = None

use_beam_search class-attribute instance-attribute

use_beam_search: bool = False

user class-attribute instance-attribute

user: Optional[str] = None

vllm_xargs class-attribute instance-attribute

vllm_xargs: Optional[dict[str, Union[str, int, float]]] = (
    Field(
        default=None,
        description="Additional request parameters with string or numeric values, used by custom extensions.",
    )
)

check_guided_decoding_count classmethod

check_guided_decoding_count(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_guided_decoding_count(cls, data):
    guide_count = sum([
        "guided_json" in data and data["guided_json"] is not None,
        "guided_regex" in data and data["guided_regex"] is not None,
        "guided_choice" in data and data["guided_choice"] is not None
    ])
    if guide_count > 1:
        raise ValueError(
            "You can only use one kind of guided decoding "
            "('guided_json', 'guided_regex' or 'guided_choice').")
    return data

check_logprobs classmethod

check_logprobs(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_logprobs(cls, data):
    if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
        if data.get("stream") and prompt_logprobs > 0:
            raise ValueError(
                "`prompt_logprobs` are not available when `stream=True`.")

        if prompt_logprobs < 0:
            raise ValueError("`prompt_logprobs` must be a positive value.")

    if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
        raise ValueError("`logprobs` must be a positive value.")

    return data

to_beam_search_params

to_beam_search_params(
    max_tokens: int,
    default_sampling_params: Optional[dict] = None,
) -> BeamSearchParams
Source code in vllm/entrypoints/openai/protocol.py
def to_beam_search_params(
    self,
    max_tokens: int,
    default_sampling_params: Optional[dict] = None,
) -> BeamSearchParams:

    if default_sampling_params is None:
        default_sampling_params = {}
    n = self.n if self.n is not None else 1

    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get("temperature", 1.0)

    return BeamSearchParams(
        beam_width=n,
        max_tokens=max_tokens,
        ignore_eos=self.ignore_eos,
        temperature=temperature,
        length_penalty=self.length_penalty,
        include_stop_str_in_output=self.include_stop_str_in_output,
    )

to_sampling_params

to_sampling_params(
    max_tokens: int,
    logits_processor_pattern: Optional[str],
    default_sampling_params: Optional[dict] = None,
) -> SamplingParams
Source code in vllm/entrypoints/openai/protocol.py
def to_sampling_params(
    self,
    max_tokens: int,
    logits_processor_pattern: Optional[str],
    default_sampling_params: Optional[dict] = None,
) -> SamplingParams:

    if default_sampling_params is None:
        default_sampling_params = {}

    # Default parameters
    if (repetition_penalty := self.repetition_penalty) is None:
        repetition_penalty = default_sampling_params.get(
            "repetition_penalty",
            self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
        )
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
    if (top_p := self.top_p) is None:
        top_p = default_sampling_params.get(
            "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
    if (top_k := self.top_k) is None:
        top_k = default_sampling_params.get(
            "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
    if (min_p := self.min_p) is None:
        min_p = default_sampling_params.get(
            "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])

    prompt_logprobs = self.prompt_logprobs
    if prompt_logprobs is None and self.echo:
        prompt_logprobs = self.logprobs

    echo_without_generation = self.echo and self.max_tokens == 0

    guided_json_object = None
    if (self.response_format is not None
            and self.response_format.type == "json_object"):
        guided_json_object = True

    guided_decoding = GuidedDecodingParams.from_optional(
        json=self.guided_json,
        regex=self.guided_regex,
        choice=self.guided_choice,
        grammar=self.guided_grammar,
        json_object=guided_json_object,
        backend=self.guided_decoding_backend,
        whitespace_pattern=self.guided_whitespace_pattern,
    )

    extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
    if self.kv_transfer_params:
        # Pass in kv_transfer_params via extra_args
        extra_args["kv_transfer_params"] = self.kv_transfer_params
    return SamplingParams.from_optional(
        n=self.n,
        best_of=self.best_of,
        presence_penalty=self.presence_penalty,
        frequency_penalty=self.frequency_penalty,
        repetition_penalty=repetition_penalty,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        seed=self.seed,
        stop=self.stop,
        stop_token_ids=self.stop_token_ids,
        logprobs=self.logprobs,
        ignore_eos=self.ignore_eos,
        max_tokens=max_tokens if not echo_without_generation else 1,
        min_tokens=self.min_tokens,
        prompt_logprobs=prompt_logprobs,
        skip_special_tokens=self.skip_special_tokens,
        spaces_between_special_tokens=self.spaces_between_special_tokens,
        include_stop_str_in_output=self.include_stop_str_in_output,
        logits_processors=get_logits_processors(self.logits_processors,
                                                logits_processor_pattern),
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        output_kind=RequestOutputKind.DELTA if self.stream \
            else RequestOutputKind.FINAL_ONLY,
        guided_decoding=guided_decoding,
        logit_bias=self.logit_bias,
        allowed_token_ids=self.allowed_token_ids,
        extra_args=extra_args or None,
        )

validate_prompt_and_prompt_embeds classmethod

validate_prompt_and_prompt_embeds(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def validate_prompt_and_prompt_embeds(cls, data):
    if data.get("prompt") is None and data.get("prompt_embeds") is None:
        raise ValueError(
            "At least one of `prompt` or `prompt_embeds` must be set.")
    return data

validate_stream_options classmethod

validate_stream_options(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def validate_stream_options(cls, data):
    if data.get("stream_options") and not data.get("stream"):
        raise ValueError(
            "Stream options can only be defined when `stream=True`.")

    return data
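
A minimal sketch (illustrative only) of a completion request that constrains generation with guided_choice and is converted with the to_sampling_params method defined above; the model name is hypothetical and logits_processor_pattern is left unset.

from vllm.entrypoints.openai.protocol import CompletionRequest

req = CompletionRequest(
    model="my-model",                # hypothetical model name
    prompt="Is the sky blue? Answer:",
    max_tokens=1,
    temperature=0.0,
    guided_choice=[" yes", " no"],   # only one guided_* option may be set
)

# No server-side allow-list pattern for logits processors in this sketch.
sampling_params = req.to_sampling_params(
    max_tokens=req.max_tokens,
    logits_processor_pattern=None,
)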

CompletionResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class CompletionResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
    object: str = "text_completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[CompletionResponseChoice]
    usage: UsageInfo
    kv_transfer_params: Optional[dict[str, Any]] = Field(
        default=None, description="KVTransfer parameters.")

choices instance-attribute

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"cmpl-{random_uuid()}"
)

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: Optional[dict[str, Any]] = Field(
    default=None, description="KVTransfer parameters."
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: str = 'text_completion'

usage instance-attribute

usage: UsageInfo

CompletionResponseChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class CompletionResponseChoice(OpenAIBaseModel):
    index: int
    text: str
    logprobs: Optional[CompletionLogProbs] = None
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = Field(
        default=None,
        description=(
            "The stop string or token id that caused the completion "
            "to stop, None if the completion finished for some other reason "
            "including encountering the EOS token"),
    )
    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None

finish_reason class-attribute instance-attribute

finish_reason: Optional[str] = None

index instance-attribute

index: int

logprobs class-attribute instance-attribute

logprobs: Optional[CompletionLogProbs] = None

prompt_logprobs class-attribute instance-attribute

prompt_logprobs: Optional[
    list[Optional[dict[int, Logprob]]]
] = None

stop_reason class-attribute instance-attribute

stop_reason: Optional[Union[int, str]] = Field(
    default=None,
    description="The stop string or token id that caused the completion to stop, None if the completion finished for some other reason including encountering the EOS token",
)

text instance-attribute

text: str

CompletionResponseStreamChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class CompletionResponseStreamChoice(OpenAIBaseModel):
    index: int
    text: str
    logprobs: Optional[CompletionLogProbs] = None
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = Field(
        default=None,
        description=(
            "The stop string or token id that caused the completion "
            "to stop, None if the completion finished for some other reason "
            "including encountering the EOS token"),
    )

finish_reason class-attribute instance-attribute

finish_reason: Optional[str] = None

index instance-attribute

index: int

logprobs class-attribute instance-attribute

logprobs: Optional[CompletionLogProbs] = None

stop_reason class-attribute instance-attribute

stop_reason: Optional[Union[int, str]] = Field(
    default=None,
    description="The stop string or token id that caused the completion to stop, None if the completion finished for some other reason including encountering the EOS token",
)

text instance-attribute

text: str

CompletionStreamResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class CompletionStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
    object: str = "text_completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[CompletionResponseStreamChoice]
    usage: Optional[UsageInfo] = Field(default=None)

choices instance-attribute

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"cmpl-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: str = 'text_completion'

usage class-attribute instance-attribute

usage: Optional[UsageInfo] = Field(default=None)

DeltaFunctionCall

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class DeltaFunctionCall(BaseModel):
    name: Optional[str] = None
    arguments: Optional[str] = None

arguments class-attribute instance-attribute

arguments: Optional[str] = None

name class-attribute instance-attribute

name: Optional[str] = None

DeltaMessage

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class DeltaMessage(OpenAIBaseModel):
    role: Optional[str] = None
    content: Optional[str] = None
    reasoning_content: Optional[str] = None
    tool_calls: list[DeltaToolCall] = Field(default_factory=list)

content class-attribute instance-attribute

content: Optional[str] = None

reasoning_content class-attribute instance-attribute

reasoning_content: Optional[str] = None

role class-attribute instance-attribute

role: Optional[str] = None

tool_calls class-attribute instance-attribute

tool_calls: list[DeltaToolCall] = Field(
    default_factory=list
)

DeltaToolCall

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class DeltaToolCall(OpenAIBaseModel):
    id: Optional[str] = None
    type: Optional[Literal["function"]] = None
    index: int
    function: Optional[DeltaFunctionCall] = None

function class-attribute instance-attribute

function: Optional[DeltaFunctionCall] = None

id class-attribute instance-attribute

id: Optional[str] = None

index instance-attribute

index: int

type class-attribute instance-attribute

type: Optional[Literal['function']] = None

DetokenizeRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class DetokenizeRequest(OpenAIBaseModel):
    model: Optional[str] = None
    tokens: list[int]

model class-attribute instance-attribute

model: Optional[str] = None

tokens instance-attribute

tokens: list[int]

DetokenizeResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class DetokenizeResponse(OpenAIBaseModel):
    prompt: str

prompt instance-attribute

prompt: str

EmbeddingChatRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class EmbeddingChatRequest(OpenAIBaseModel):
    model: Optional[str] = None
    messages: list[ChatCompletionMessageParam]

    encoding_format: Literal["float", "base64"] = "float"
    dimensions: Optional[int] = None
    user: Optional[str] = None
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

    # --8<-- [start:chat-embedding-pooling-params]
    additional_data: Optional[Any] = None
    # --8<-- [end:chat-embedding-pooling-params]

    # --8<-- [start:chat-embedding-extra-params]
    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."),
    )
    chat_template: Optional[str] = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."),
    )
    chat_template_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."),
    )
    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
    # --8<-- [end:chat-embedding-extra-params]

    @model_validator(mode="before")
    @classmethod
    def check_generation_prompt(cls, data):
        if data.get("continue_final_message") and data.get(
                "add_generation_prompt"):
            raise ValueError("Cannot set both `continue_final_message` and "
                             "`add_generation_prompt` to True.")
        return data

    def to_pooling_params(self):
        return PoolingParams(dimensions=self.dimensions,
                             additional_data=self.additional_data)

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=False,
    description="If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).",
)

additional_data class-attribute instance-attribute

additional_data: Optional[Any] = None

chat_template class-attribute instance-attribute

chat_template: Optional[str] = Field(
    default=None,
    description="A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one.",
)

chat_template_kwargs class-attribute instance-attribute

chat_template_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional keyword args to pass to the template renderer. Will be accessible by the chat template.",
)

dimensions class-attribute instance-attribute

dimensions: Optional[int] = None

encoding_format class-attribute instance-attribute

encoding_format: Literal['float', 'base64'] = 'float'

messages instance-attribute

mm_processor_kwargs class-attribute instance-attribute

mm_processor_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional kwargs to pass to the HF processor.",
)

model class-attribute instance-attribute

model: Optional[str] = None

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Field(ge=-1)]
] = None

user class-attribute instance-attribute

user: Optional[str] = None

check_generation_prompt classmethod

check_generation_prompt(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_generation_prompt(cls, data):
    if data.get("continue_final_message") and data.get(
            "add_generation_prompt"):
        raise ValueError("Cannot set both `continue_final_message` and "
                         "`add_generation_prompt` to True.")
    return data

to_pooling_params

to_pooling_params()
Source code in vllm/entrypoints/openai/protocol.py
def to_pooling_params(self):
    return PoolingParams(dimensions=self.dimensions,
                         additional_data=self.additional_data)

EmbeddingCompletionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class EmbeddingCompletionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/embeddings
    model: Optional[str] = None
    input: Union[list[int], list[list[int]], str, list[str]]
    encoding_format: Literal["float", "base64"] = "float"
    dimensions: Optional[int] = None
    user: Optional[str] = None
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

    # --8<-- [start:embedding-pooling-params]
    additional_data: Optional[Any] = None
    # --8<-- [end:embedding-pooling-params]

    # --8<-- [start:embedding-extra-params]
    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )

    # --8<-- [end:embedding-extra-params]

    def to_pooling_params(self):
        return PoolingParams(dimensions=self.dimensions,
                             additional_data=self.additional_data)

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=True,
    description="If true (the default), special tokens (e.g. BOS) will be added to the prompt.",
)

additional_data class-attribute instance-attribute

additional_data: Optional[Any] = None

dimensions class-attribute instance-attribute

dimensions: Optional[int] = None

encoding_format class-attribute instance-attribute

encoding_format: Literal['float', 'base64'] = 'float'

input instance-attribute

input: Union[list[int], list[list[int]], str, list[str]]

model class-attribute instance-attribute

model: Optional[str] = None

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Field(ge=-1)]
] = None

user class-attribute instance-attribute

user: Optional[str] = None

to_pooling_params

to_pooling_params()
Source code in vllm/entrypoints/openai/protocol.py
def to_pooling_params(self):
    return PoolingParams(dimensions=self.dimensions,
                         additional_data=self.additional_data)
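
A minimal sketch (illustrative only): an embeddings request over raw text inputs, converted to pooling parameters; the model name and output dimension are hypothetical.

from vllm.entrypoints.openai.protocol import EmbeddingCompletionRequest

req = EmbeddingCompletionRequest(
    model="my-embedding-model",  # hypothetical model name
    input=["first sentence", "second sentence"],
    dimensions=256,              # optionally request a reduced output dimension
)
pooling_params = req.to_pooling_params()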

EmbeddingResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class EmbeddingResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[EmbeddingResponseData]
    usage: UsageInfo

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

data instance-attribute

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"embd-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: str = 'list'

usage instance-attribute

usage: UsageInfo

EmbeddingResponseData

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class EmbeddingResponseData(OpenAIBaseModel):
    index: int
    object: str = "embedding"
    embedding: Union[list[float], str]

embedding instance-attribute

embedding: Union[list[float], str]

index instance-attribute

index: int

object class-attribute instance-attribute

object: str = 'embedding'

ErrorResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ErrorResponse(OpenAIBaseModel):
    object: str = "error"
    message: str
    type: str
    param: Optional[str] = None
    code: int

code instance-attribute

code: int

message instance-attribute

message: str

object class-attribute instance-attribute

object: str = 'error'

param class-attribute instance-attribute

param: Optional[str] = None

type instance-attribute

type: str

ExtractedToolCallInformation

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ExtractedToolCallInformation(BaseModel):
    # indicate if tools were called
    tools_called: bool

    # extracted tool calls
    tool_calls: list[ToolCall]

    # content - per OpenAI spec, content AND tool calls can be returned rarely
    # But some models will do this intentionally
    content: Optional[str] = None

content class-attribute instance-attribute

content: Optional[str] = None

tool_calls instance-attribute

tool_calls: list[ToolCall]

tools_called instance-attribute

tools_called: bool

FunctionCall

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class FunctionCall(OpenAIBaseModel):
    name: str
    arguments: str

arguments instance-attribute

arguments: str

name instance-attribute

name: str

FunctionDefinition

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class FunctionDefinition(OpenAIBaseModel):
    name: str
    description: Optional[str] = None
    parameters: Optional[dict[str, Any]] = None

description class-attribute instance-attribute

description: Optional[str] = None

name instance-attribute

name: str

parameters class-attribute instance-attribute

parameters: Optional[dict[str, Any]] = None

JsonSchemaResponseFormat

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class JsonSchemaResponseFormat(OpenAIBaseModel):
    name: str
    description: Optional[str] = None
    # schema is the field in openai but that causes conflicts with pydantic so
    # instead use json_schema with an alias
    json_schema: Optional[dict[str, Any]] = Field(default=None, alias='schema')
    strict: Optional[bool] = None

description class-attribute instance-attribute

description: Optional[str] = None

json_schema class-attribute instance-attribute

json_schema: Optional[dict[str, Any]] = Field(
    default=None, alias="schema"
)

name instance-attribute

name: str

strict class-attribute instance-attribute

strict: Optional[bool] = None
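
A minimal sketch of the 'schema' alias noted in the source above: the wire field is named schema, while the Python attribute is json_schema; the schema body is hypothetical.

from vllm.entrypoints.openai.protocol import JsonSchemaResponseFormat

fmt = JsonSchemaResponseFormat.model_validate({
    "name": "person",
    # Sent as "schema" on the wire; stored on the model as `json_schema`.
    "schema": {"type": "object", "properties": {"age": {"type": "integer"}}},
})
print(fmt.json_schema["type"])  # -> "object"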

LoadLoRAAdapterRequest

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class LoadLoRAAdapterRequest(BaseModel):
    lora_name: str
    lora_path: str

lora_name instance-attribute

lora_name: str

lora_path instance-attribute

lora_path: str

LogitsProcessorConstructor

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class LogitsProcessorConstructor(BaseModel):
    qualname: str
    args: Optional[list[Any]] = None
    kwargs: Optional[dict[str, Any]] = None

    model_config = ConfigDict(extra="forbid")

args class-attribute instance-attribute

args: Optional[list[Any]] = None

kwargs class-attribute instance-attribute

kwargs: Optional[dict[str, Any]] = None

model_config class-attribute instance-attribute

model_config = ConfigDict(extra='forbid')

qualname instance-attribute

qualname: str
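
A minimal sketch (illustrative only) of the two forms accepted by the logits_processors request field described earlier: a fully-qualified name string, or a constructor object. 'my_module.MyLogitsProcessor' and 'my_module.OtherProcessor' are hypothetical.

from vllm.entrypoints.openai.protocol import LogitsProcessorConstructor

# Constructor form: qualified name plus positional and keyword arguments.
ctor = LogitsProcessorConstructor(
    qualname="my_module.MyLogitsProcessor",  # hypothetical processor class
    args=[1, 2],
    kwargs={"param": "value"},
)

# A request's `logits_processors` list may mix plain strings and constructors.
logits_processors = ["my_module.OtherProcessor", ctor]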

ModelCard

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ModelCard(OpenAIBaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "vllm"
    root: Optional[str] = None
    parent: Optional[str] = None
    max_model_len: Optional[int] = None
    permission: list[ModelPermission] = Field(default_factory=list)

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id instance-attribute

id: str

max_model_len class-attribute instance-attribute

max_model_len: Optional[int] = None

object class-attribute instance-attribute

object: str = 'model'

owned_by class-attribute instance-attribute

owned_by: str = 'vllm'

parent class-attribute instance-attribute

parent: Optional[str] = None

permission class-attribute instance-attribute

permission: list[ModelPermission] = Field(
    default_factory=list
)

root class-attribute instance-attribute

root: Optional[str] = None

ModelList

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ModelList(OpenAIBaseModel):
    object: str = "list"
    data: list[ModelCard] = Field(default_factory=list)

data class-attribute instance-attribute

data: list[ModelCard] = Field(default_factory=list)

object class-attribute instance-attribute

object: str = 'list'

ModelPermission

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ModelPermission(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
    object: str = "model_permission"
    created: int = Field(default_factory=lambda: int(time.time()))
    allow_create_engine: bool = False
    allow_sampling: bool = True
    allow_logprobs: bool = True
    allow_search_indices: bool = False
    allow_view: bool = True
    allow_fine_tuning: bool = False
    organization: str = "*"
    group: Optional[str] = None
    is_blocking: bool = False

allow_create_engine class-attribute instance-attribute

allow_create_engine: bool = False

allow_fine_tuning class-attribute instance-attribute

allow_fine_tuning: bool = False

allow_logprobs class-attribute instance-attribute

allow_logprobs: bool = True

allow_sampling class-attribute instance-attribute

allow_sampling: bool = True

allow_search_indices class-attribute instance-attribute

allow_search_indices: bool = False

allow_view class-attribute instance-attribute

allow_view: bool = True

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

group class-attribute instance-attribute

group: Optional[str] = None

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"modelperm-{random_uuid()}"
)

is_blocking class-attribute instance-attribute

is_blocking: bool = False

object class-attribute instance-attribute

object: str = 'model_permission'

organization class-attribute instance-attribute

organization: str = '*'

OpenAIBaseModel

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class OpenAIBaseModel(BaseModel):
    # OpenAI API does allow extra fields
    model_config = ConfigDict(extra="allow")

    # Cache class field names
    field_names: ClassVar[Optional[set[str]]] = None

    @model_validator(mode="wrap")
    @classmethod
    def __log_extra_fields__(cls, data, handler):
        result = handler(data)
        if not isinstance(data, dict):
            return result
        field_names = cls.field_names
        if field_names is None:
            # Get all class field names and their potential aliases
            field_names = set()
            for field_name, field in cls.model_fields.items():
                field_names.add(field_name)
                if alias := getattr(field, "alias", None):
                    field_names.add(alias)
            cls.field_names = field_names

        # Compare against both field names and aliases
        if any(k not in field_names for k in data):
            logger.warning(
                "The following fields were present in the request "
                "but ignored: %s",
                data.keys() - field_names,
            )
        return result

field_names class-attribute

field_names: Optional[set[str]] = None

model_config class-attribute instance-attribute

model_config = ConfigDict(extra='allow')

__log_extra_fields__ classmethod

__log_extra_fields__(data, handler)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="wrap")
@classmethod
def __log_extra_fields__(cls, data, handler):
    result = handler(data)
    if not isinstance(data, dict):
        return result
    field_names = cls.field_names
    if field_names is None:
        # Get all class field names and their potential aliases
        field_names = set()
        for field_name, field in cls.model_fields.items():
            field_names.add(field_name)
            if alias := getattr(field, "alias", None):
                field_names.add(alias)
        cls.field_names = field_names

    # Compare against both field names and aliases
    if any(k not in field_names for k in data):
        logger.warning(
            "The following fields were present in the request "
            "but ignored: %s",
            data.keys() - field_names,
        )
    return result
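
A minimal sketch of the extra-field handling implemented above: unknown request fields are accepted (extra='allow') but logged as ignored; the extra field name is hypothetical.

from vllm.entrypoints.openai.protocol import DetokenizeRequest

# "not_a_real_field" is not declared on the model, so __log_extra_fields__
# logs a warning naming it, while validation still succeeds.
req = DetokenizeRequest.model_validate({
    "tokens": [1, 2, 3],
    "not_a_real_field": True,
})
print(req.tokens)  # -> [1, 2, 3]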

PoolingResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class PoolingResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"pool-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[PoolingResponseData]
    usage: UsageInfo

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

data instance-attribute

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"pool-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: str = 'list'

usage instance-attribute

usage: UsageInfo

PoolingResponseData

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class PoolingResponseData(OpenAIBaseModel):
    index: int
    object: str = "pooling"
    data: Union[list[list[float]], list[float], str]

data instance-attribute

index instance-attribute

index: int

object class-attribute instance-attribute

object: str = 'pooling'

PromptTokenUsageInfo

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class PromptTokenUsageInfo(OpenAIBaseModel):
    cached_tokens: Optional[int] = None

cached_tokens class-attribute instance-attribute

cached_tokens: Optional[int] = None

RequestResponseMetadata

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class RequestResponseMetadata(BaseModel):
    request_id: str
    final_usage_info: Optional[UsageInfo] = None

final_usage_info class-attribute instance-attribute

final_usage_info: Optional[UsageInfo] = None

request_id instance-attribute

request_id: str

RerankDocument

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class RerankDocument(BaseModel):
    text: str

text instance-attribute

text: str

RerankRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class RerankRequest(OpenAIBaseModel):
    model: Optional[str] = None
    query: str
    documents: list[str]
    top_n: int = Field(default_factory=lambda: 0)
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

    # --8<-- [start:rerank-pooling-params]
    additional_data: Optional[Any] = None
    # --8<-- [end:rerank-pooling-params]

    # --8<-- [start:rerank-extra-params]
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )

    # --8<-- [end:rerank-extra-params]

    def to_pooling_params(self):
        return PoolingParams(additional_data=self.additional_data)

additional_data class-attribute instance-attribute

additional_data: Optional[Any] = None

documents instance-attribute

documents: list[str]

model class-attribute instance-attribute

model: Optional[str] = None

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

query instance-attribute

query: str

top_n class-attribute instance-attribute

top_n: int = Field(default_factory=lambda: 0)

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Field(ge=-1)]
] = None

to_pooling_params

to_pooling_params()
Source code in vllm/entrypoints/openai/protocol.py
def to_pooling_params(self):
    return PoolingParams(additional_data=self.additional_data)
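
For reference, a minimal rerank request body validated through this model; the model name and documents below are illustrative placeholders, not defaults:

from vllm.entrypoints.openai.protocol import RerankRequest

req = RerankRequest.model_validate({
    "model": "BAAI/bge-reranker-base",  # placeholder model name
    "query": "What is the capital of France?",
    "documents": [
        "Paris is the capital of France.",
        "The Eiffel Tower is in Paris.",
    ],
    "top_n": 1,
})
pooling_params = req.to_pooling_params()  # forwards additional_data (None here)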

RerankResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class RerankResponse(OpenAIBaseModel):
    id: str
    model: str
    usage: RerankUsage
    results: list[RerankResult]

id instance-attribute

id: str

model instance-attribute

model: str

results instance-attribute

results: list[RerankResult]

usage instance-attribute

usage: RerankUsage

RerankResult

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class RerankResult(BaseModel):
    index: int
    document: RerankDocument
    relevance_score: float

document instance-attribute

document: RerankDocument

index instance-attribute

index: int

relevance_score instance-attribute

relevance_score: float

RerankUsage

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class RerankUsage(BaseModel):
    total_tokens: int

total_tokens instance-attribute

total_tokens: int

ResponseFormat

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ResponseFormat(OpenAIBaseModel):
    # type must be "json_schema", "json_object", or "text"
    type: Literal["text", "json_object", "json_schema"]
    json_schema: Optional[JsonSchemaResponseFormat] = None

json_schema class-attribute instance-attribute

json_schema: Optional[JsonSchemaResponseFormat] = None

type instance-attribute

type: Literal['text', 'json_object', 'json_schema']
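
A sketch of a json_schema response format as it would appear in a request body. The nested keys follow the OpenAI convention and are assumed to match JsonSchemaResponseFormat (documented earlier in this module), where the JSON schema itself is supplied under the "schema" alias:

from vllm.entrypoints.openai.protocol import ResponseFormat

fmt = ResponseFormat.model_validate({
    "type": "json_schema",
    "json_schema": {
        "name": "city_info",  # assumed OpenAI-style key on JsonSchemaResponseFormat
        "schema": {           # alias of the json_schema field
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
})
assert fmt.type == "json_schema"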

ScoreRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ScoreRequest(OpenAIBaseModel):
    model: Optional[str] = None
    text_1: Union[list[str], str]
    text_2: Union[list[str], str]
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

    # --8<-- [start:score-pooling-params]
    additional_data: Optional[Any] = None
    # --8<-- [end:score-pooling-params]

    # --8<-- [start:score-extra-params]
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )

    # --8<-- [end:score-extra-params]

    def to_pooling_params(self):
        return PoolingParams(additional_data=self.additional_data)

additional_data class-attribute instance-attribute

additional_data: Optional[Any] = None

model class-attribute instance-attribute

model: Optional[str] = None

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

text_1 instance-attribute

text_1: Union[list[str], str]

text_2 instance-attribute

text_2: Union[list[str], str]

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Field(ge=-1)]
] = None

to_pooling_params

to_pooling_params()
Source code in vllm/entrypoints/openai/protocol.py
def to_pooling_params(self):
    return PoolingParams(additional_data=self.additional_data)
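
text_1 and text_2 each accept either a single string or a list of strings, so a single query can be paired with several candidates in one request. A minimal sketch:

from vllm.entrypoints.openai.protocol import ScoreRequest

req = ScoreRequest.model_validate({
    "text_1": "A man is eating food.",
    "text_2": [
        "A man is eating a piece of bread.",
        "The girl is carrying a baby.",
    ],
})
pooling_params = req.to_pooling_params()  # forwards additional_data (None here)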

ScoreResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ScoreResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[ScoreResponseData]
    usage: UsageInfo

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

data instance-attribute

data: list[ScoreResponseData]

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"embd-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: str = 'list'

usage instance-attribute

usage: UsageInfo

ScoreResponseData

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ScoreResponseData(OpenAIBaseModel):
    index: int
    object: str = "score"
    score: float

index instance-attribute

index: int

object class-attribute instance-attribute

object: str = 'score'

score instance-attribute

score: float

StreamOptions

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class StreamOptions(OpenAIBaseModel):
    include_usage: Optional[bool] = True
    continuous_usage_stats: Optional[bool] = False

continuous_usage_stats class-attribute instance-attribute

continuous_usage_stats: Optional[bool] = False

include_usage class-attribute instance-attribute

include_usage: Optional[bool] = True

StructuralTag

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class StructuralTag(OpenAIBaseModel):
    begin: str
    # schema is the field, but that causes conflicts with pydantic so
    # instead use structural_tag_schema with an alias
    structural_tag_schema: Optional[dict[str, Any]] = Field(default=None,
                                                            alias="schema")
    end: str

begin instance-attribute

begin: str

end instance-attribute

end: str

structural_tag_schema class-attribute instance-attribute

structural_tag_schema: Optional[dict[str, Any]] = Field(
    default=None, alias="schema"
)

StructuralTagResponseFormat

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class StructuralTagResponseFormat(OpenAIBaseModel):
    type: Literal["structural_tag"]
    structures: list[StructuralTag]
    triggers: list[str]

structures instance-attribute

structures: list[StructuralTag]

triggers instance-attribute

triggers: list[str]

type instance-attribute

type: Literal['structural_tag']
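
A sketch of a structural-tag format: roughly, generation is unconstrained until one of the trigger strings appears, after which the text between begin and end is expected to satisfy the attached schema. The tool name get_weather is illustrative, and the JSON schema is supplied under the "schema" alias of structural_tag_schema:

from vllm.entrypoints.openai.protocol import StructuralTagResponseFormat

fmt = StructuralTagResponseFormat.model_validate({
    "type": "structural_tag",
    "structures": [{
        "begin": "<function=get_weather>",
        "schema": {  # alias of structural_tag_schema
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
        "end": "</function>",
    }],
    "triggers": ["<function="],
})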

TokenizeChatRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TokenizeChatRequest(OpenAIBaseModel):
    model: Optional[str] = None
    messages: list[ChatCompletionMessageParam]

    add_generation_prompt: bool = Field(
        default=True,
        description=
        ("If true, the generation prompt will be added to the chat template. "
         "This is a parameter used by chat template in tokenizer config of the "
         "model."),
    )
    return_token_strs: Optional[bool] = Field(
        default=False,
        description=("If true, also return the token strings "
                     "corresponding to the token ids."),
    )
    continue_final_message: bool = Field(
        default=False,
        description=
        ("If this is set, the chat will be formatted so that the final "
         "message in the chat is open-ended, without any EOS tokens. The "
         "model will continue this message rather than starting a new one. "
         "This allows you to \"prefill\" part of the model's response for it. "
         "Cannot be used at the same time as `add_generation_prompt`."),
    )
    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."),
    )
    chat_template: Optional[str] = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."),
    )
    chat_template_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."),
    )
    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    tools: Optional[list[ChatCompletionToolsParam]] = Field(
        default=None,
        description=("A list of tools the model may call."),
    )

    @model_validator(mode="before")
    @classmethod
    def check_generation_prompt(cls, data):
        if data.get("continue_final_message") and data.get(
                "add_generation_prompt"):
            raise ValueError("Cannot set both `continue_final_message` and "
                             "`add_generation_prompt` to True.")
        return data

add_generation_prompt class-attribute instance-attribute

add_generation_prompt: bool = Field(
    default=True,
    description="If true, the generation prompt will be added to the chat template. This is a parameter used by chat template in tokenizer config of the model.",
)

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=False,
    description="If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).",
)

chat_template class-attribute instance-attribute

chat_template: Optional[str] = Field(
    default=None,
    description="A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one.",
)

chat_template_kwargs class-attribute instance-attribute

chat_template_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional keyword args to pass to the template renderer. Will be accessible by the chat template.",
)

continue_final_message class-attribute instance-attribute

continue_final_message: bool = Field(
    default=False,
    description='If this is set, the chat will be formatted so that the final message in the chat is open-ended, without any EOS tokens. The model will continue this message rather than starting a new one. This allows you to "prefill" part of the model\'s response for it. Cannot be used at the same time as `add_generation_prompt`.',
)

messages instance-attribute

messages: list[ChatCompletionMessageParam]

mm_processor_kwargs class-attribute instance-attribute

mm_processor_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional kwargs to pass to the HF processor.",
)

model class-attribute instance-attribute

model: Optional[str] = None

return_token_strs class-attribute instance-attribute

return_token_strs: Optional[bool] = Field(
    default=False,
    description="If true, also return the token strings corresponding to the token ids.",
)

tools class-attribute instance-attribute

tools: Optional[list[ChatCompletionToolsParam]] = Field(
    default=None,
    description="A list of tools the model may call.",
)

check_generation_prompt classmethod

check_generation_prompt(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_generation_prompt(cls, data):
    if data.get("continue_final_message") and data.get(
            "add_generation_prompt"):
        raise ValueError("Cannot set both `continue_final_message` and "
                         "`add_generation_prompt` to True.")
    return data
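
The validator above rejects the contradictory combination of continuing the final message while also appending a generation prompt; for example (the ValueError is surfaced as a pydantic ValidationError):

from vllm.entrypoints.openai.protocol import TokenizeChatRequest

body = {
    "messages": [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi, I"},
    ],
    "add_generation_prompt": True,
    "continue_final_message": True,  # conflicts with add_generation_prompt
}
try:
    TokenizeChatRequest.model_validate(body)
except ValueError as err:
    print(err)  # Cannot set both `continue_final_message` and `add_generation_prompt` to True.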

TokenizeCompletionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TokenizeCompletionRequest(OpenAIBaseModel):
    model: Optional[str] = None
    prompt: str

    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."),
    )
    return_token_strs: Optional[bool] = Field(
        default=False,
        description=("If true, also return the token strings "
                     "corresponding to the token ids."),
    )

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=True,
    description="If true (the default), special tokens (e.g. BOS) will be added to the prompt.",
)

model class-attribute instance-attribute

model: Optional[str] = None

prompt instance-attribute

prompt: str

return_token_strs class-attribute instance-attribute

return_token_strs: Optional[bool] = Field(
    default=False,
    description="If true, also return the token strings corresponding to the token ids.",
)

TokenizeResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TokenizeResponse(OpenAIBaseModel):
    count: int
    max_model_len: int
    tokens: list[int]
    token_strs: Optional[list[str]] = None

count instance-attribute

count: int

max_model_len instance-attribute

max_model_len: int

token_strs class-attribute instance-attribute

token_strs: Optional[list[str]] = None

tokens instance-attribute

tokens: list[int]

ToolCall

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ToolCall(OpenAIBaseModel):
    id: str = Field(default_factory=random_tool_call_id)
    type: Literal["function"] = "function"
    function: FunctionCall

function instance-attribute

function: FunctionCall

id class-attribute instance-attribute

id: str = Field(default_factory=random_tool_call_id)

type class-attribute instance-attribute

type: Literal['function'] = 'function'

TranscriptionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/audio/createTranscription

    file: UploadFile
    """
    The audio file object (not file name) to transcribe, in one of these
    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    """

    model: Optional[str] = None
    """ID of the model to use.
    """

    language: Optional[str] = None
    """The language of the input audio.

    Supplying the input language in
    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
    will improve accuracy and latency.
    """

    prompt: str = Field(default="")
    """An optional text to guide the model's style or continue a previous audio
    segment.

    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
    should match the audio language.
    """

    response_format: AudioResponseFormat = Field(default="json")
    """
    The format of the output, in one of these options: `json`, `text`, `srt`,
    `verbose_json`, or `vtt`.
    """

    ## TODO (varun) : Support if set to 0, certain thresholds are met !!

    timestamp_granularities: list[Literal["word", "segment"]] = Field(
        alias="timestamp_granularities[]", default=[])
    """The timestamp granularities to populate for this transcription.

    `response_format` must be set to `verbose_json` to use timestamp granularities.
    Either or both of these options are supported: `word`, or `segment`. Note:
    There is no additional latency for segment timestamps, but generating word
    timestamps incurs additional latency.
    """

    stream: Optional[bool] = False
    """When set, it will enable output to be streamed in a similar fashion 
    as the Chat Completion endpoint.
    """
    # --8<-- [start:transcription-extra-params]
    # Flattened stream option to simplify form data.
    stream_include_usage: Optional[bool] = False
    stream_continuous_usage_stats: Optional[bool] = False

    vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
        default=None,
        description=("Additional request parameters with string or "
                     "numeric values, used by custom extensions."),
    )
    # --8<-- [end:transcription-extra-params]

    # --8<-- [start:transcription-sampling-params]
    temperature: float = Field(default=0.0)
    """The sampling temperature, between 0 and 1.

    Higher values like 0.8 will make the output more random, while lower values
    like 0.2 will make it more focused / deterministic. If set to 0, the model
    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
    to automatically increase the temperature until certain thresholds are hit.
    """

    top_p: Optional[float] = None
    """Enables nucleus (top-p) sampling, where tokens are selected from the
    smallest possible set whose cumulative probability exceeds `p`.
    """

    top_k: Optional[int] = None
    """Limits sampling to the `k` most probable tokens at each step."""

    min_p: Optional[float] = None
    """Filters out tokens with a probability lower than `min_p`, ensuring a
    minimum likelihood threshold during sampling.
    """

    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    """The seed to use for sampling."""

    frequency_penalty: Optional[float] = 0.0
    """The frequency penalty to use for sampling."""

    repetition_penalty: Optional[float] = None
    """The repetition penalty to use for sampling."""

    presence_penalty: Optional[float] = 0.0
    """The presence penalty to use for sampling."""
    # --8<-- [end:transcription-sampling-params]

    # Default sampling parameters for transcription requests.
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "repetition_penalty": 1.0,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 0,
        "min_p": 0.0,
    }

    def to_sampling_params(
            self,
            default_max_tokens: int,
            default_sampling_params: Optional[dict] = None) -> SamplingParams:

        max_tokens = default_max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}

        # Default parameters
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
        if (top_p := self.top_p) is None:
            top_p = default_sampling_params.get(
                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
        if (top_k := self.top_k) is None:
            top_k = default_sampling_params.get(
                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
        if (min_p := self.min_p) is None:
            min_p = default_sampling_params.get(
                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])

        if (repetition_penalty := self.repetition_penalty) is None:
            repetition_penalty = default_sampling_params.get(
                "repetition_penalty",
                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"])

        return SamplingParams.from_optional(temperature=temperature,
                                            max_tokens=max_tokens,
                                            seed=self.seed,
                                            top_p=top_p,
                                            top_k=top_k,
                                            min_p=min_p,
                                            frequency_penalty=self.frequency_penalty,
                                            repetition_penalty=repetition_penalty,
                                            presence_penalty=self.presence_penalty,
                                            output_kind=RequestOutputKind.DELTA
                                            if self.stream \
                                            else RequestOutputKind.FINAL_ONLY,
                                            extra_args=self.vllm_xargs)

    @model_validator(mode="before")
    @classmethod
    def validate_transcription_request(cls, data):
        if isinstance(data.get("file"), str):
            raise HTTPException(
                status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
                detail="Expected 'file' to be a file-like object, not 'str'.",
            )

        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
        stream = data.get("stream", False)
        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
            raise ValueError(
                "Stream options can only be defined when `stream=True`.")

        return data

_DEFAULT_SAMPLING_PARAMS class-attribute instance-attribute

_DEFAULT_SAMPLING_PARAMS: dict = {
    "repetition_penalty": 1.0,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 0,
    "min_p": 0.0,
}

file instance-attribute

file: UploadFile

The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.

frequency_penalty class-attribute instance-attribute

frequency_penalty: Optional[float] = 0.0

The frequency penalty to use for sampling.

language class-attribute instance-attribute

language: Optional[str] = None

The language of the input audio.

Supplying the input language in ISO-639-1 format will improve accuracy and latency.

min_p class-attribute instance-attribute

min_p: Optional[float] = None

Filters out tokens with a probability lower than min_p, ensuring a minimum likelihood threshold during sampling.

model class-attribute instance-attribute

model: Optional[str] = None

ID of the model to use.

presence_penalty class-attribute instance-attribute

presence_penalty: Optional[float] = 0.0

The presence penalty to use for sampling.

prompt class-attribute instance-attribute

prompt: str = Field(default='')

An optional text to guide the model's style or continue a previous audio segment.

The prompt should match the audio language.

repetition_penalty class-attribute instance-attribute

repetition_penalty: Optional[float] = None

The repetition penalty to use for sampling.

response_format class-attribute instance-attribute

response_format: AudioResponseFormat = Field(default="json")

The format of the output, in one of these options: json, text, srt, verbose_json, or vtt.

seed class-attribute instance-attribute

seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)

The seed to use for sampling.

stream class-attribute instance-attribute

stream: Optional[bool] = False

When set, it will enable output to be streamed in a similar fashion to the Chat Completion endpoint.

stream_continuous_usage_stats class-attribute instance-attribute

stream_continuous_usage_stats: Optional[bool] = False

stream_include_usage class-attribute instance-attribute

stream_include_usage: Optional[bool] = False

temperature class-attribute instance-attribute

temperature: float = Field(default=0.0)

The sampling temperature, between 0 and 1.

Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused / deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.

timestamp_granularities class-attribute instance-attribute

timestamp_granularities: list[
    Literal["word", "segment"]
] = Field(alias="timestamp_granularities[]", default=[])

The timestamp granularities to populate for this transcription.

response_format must be set to verbose_json to use timestamp granularities. Either or both of these options are supported: word or segment. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.

top_k class-attribute instance-attribute

top_k: Optional[int] = None

Limits sampling to the k most probable tokens at each step.

top_p class-attribute instance-attribute

top_p: Optional[float] = None

Enables nucleus (top-p) sampling, where tokens are selected from the smallest possible set whose cumulative probability exceeds p.

vllm_xargs class-attribute instance-attribute

vllm_xargs: Optional[dict[str, Union[str, int, float]]] = (
    Field(
        default=None,
        description="Additional request parameters with string or numeric values, used by custom extensions.",
    )
)

to_sampling_params

to_sampling_params(
    default_max_tokens: int,
    default_sampling_params: Optional[dict] = None,
) -> SamplingParams
Source code in vllm/entrypoints/openai/protocol.py
def to_sampling_params(
        self,
        default_max_tokens: int,
        default_sampling_params: Optional[dict] = None) -> SamplingParams:

    max_tokens = default_max_tokens

    if default_sampling_params is None:
        default_sampling_params = {}

    # Default parameters
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
    if (top_p := self.top_p) is None:
        top_p = default_sampling_params.get(
            "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
    if (top_k := self.top_k) is None:
        top_k = default_sampling_params.get(
            "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
    if (min_p := self.min_p) is None:
        min_p = default_sampling_params.get(
            "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])

    if (repetition_penalty := self.repetition_penalty) is None:
        repetition_penalty = default_sampling_params.get(
            "repetition_penalty",
            self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"])

    return SamplingParams.from_optional(temperature=temperature,
                                        max_tokens=max_tokens,
                                        seed=self.seed,
                                        top_p=top_p,
                                        top_k=top_k,
                                        min_p=min_p,
                                        frequency_penalty=self.frequency_penalty,
                                        repetition_penalty=repetition_penalty,
                                        presence_penalty=self.presence_penalty,
                                        output_kind=RequestOutputKind.DELTA
                                        if self.stream \
                                        else RequestOutputKind.FINAL_ONLY,
                                        extra_args=self.vllm_xargs)

validate_transcription_request classmethod

validate_transcription_request(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def validate_transcription_request(cls, data):
    if isinstance(data.get("file"), str):
        raise HTTPException(
            status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
            detail="Expected 'file' to be a file-like object, not 'str'.",
        )

    stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
    stream = data.get("stream", False)
    if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
        raise ValueError(
            "Stream options can only be defined when `stream=True`.")

    return data
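
Like the chat endpoint's stream options, the flattened stream_include_usage / stream_continuous_usage_stats fields are only meaningful when stream=True, and the validator above enforces this before field validation runs. A sketch (no audio file is attached, so this only exercises the validator, not a usable request):

from vllm.entrypoints.openai.protocol import TranscriptionRequest

try:
    TranscriptionRequest.model_validate({
        "stream": False,
        "stream_include_usage": True,  # requires stream=True
    })
except ValueError as err:
    print(err)  # Stream options can only be defined when `stream=True`.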

TranscriptionResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionResponse(OpenAIBaseModel):
    text: str
    """The transcribed text."""

text instance-attribute

text: str

The transcribed text.

TranscriptionResponseStreamChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionResponseStreamChoice(OpenAIBaseModel):
    delta: DeltaMessage
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = None

delta instance-attribute

delta: DeltaMessage

finish_reason class-attribute instance-attribute

finish_reason: Optional[str] = None

stop_reason class-attribute instance-attribute

stop_reason: Optional[Union[int, str]] = None

TranscriptionResponseVerbose

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionResponseVerbose(OpenAIBaseModel):
    duration: str
    """The duration of the input audio."""

    language: str
    """The language of the input audio."""

    text: str
    """The transcribed text."""

    segments: Optional[list[TranscriptionSegment]] = None
    """Segments of the transcribed text and their corresponding details."""

    words: Optional[list[TranscriptionWord]] = None
    """Extracted words and their corresponding timestamps."""

duration instance-attribute

duration: str

The duration of the input audio.

language instance-attribute

language: str

The language of the input audio.

segments class-attribute instance-attribute

segments: Optional[list[TranscriptionSegment]] = None

Segments of the transcribed text and their corresponding details.

text instance-attribute

text: str

The transcribed text.

words class-attribute instance-attribute

words: Optional[list[TranscriptionWord]] = None

Extracted words and their corresponding timestamps.

TranscriptionSegment

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionSegment(OpenAIBaseModel):
    id: int
    """Unique identifier of the segment."""

    avg_logprob: float
    """Average logprob of the segment.

    If the value is lower than -1, consider the logprobs failed.
    """

    compression_ratio: float
    """Compression ratio of the segment.

    If the value is greater than 2.4, consider the compression failed.
    """

    end: float
    """End time of the segment in seconds."""

    no_speech_prob: float
    """Probability of no speech in the segment.

    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
    this segment silent.
    """

    seek: int
    """Seek offset of the segment."""

    start: float
    """Start time of the segment in seconds."""

    temperature: float
    """Temperature parameter used for generating the segment."""

    text: str
    """Text content of the segment."""

    tokens: list[int]
    """Array of token IDs for the text content."""

avg_logprob instance-attribute

avg_logprob: float

Average logprob of the segment.

If the value is lower than -1, consider the logprobs failed.

compression_ratio instance-attribute

compression_ratio: float

Compression ratio of the segment.

If the value is greater than 2.4, consider the compression failed.

end instance-attribute

end: float

End time of the segment in seconds.

id instance-attribute

id: int

Unique identifier of the segment.

no_speech_prob instance-attribute

no_speech_prob: float

Probability of no speech in the segment.

If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent.

seek instance-attribute

seek: int

Seek offset of the segment.

start instance-attribute

start: float

Start time of the segment in seconds.

temperature instance-attribute

temperature: float

Temperature parameter used for generating the segment.

text instance-attribute

text: str

Text content of the segment.

tokens instance-attribute

tokens: list[int]

Array of token IDs for the text content.

TranscriptionStreamResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}")
    object: Literal["transcription.chunk"] = "transcription.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[TranscriptionResponseStreamChoice]
    usage: Optional[UsageInfo] = Field(default=None)

choices instance-attribute

choices: list[TranscriptionResponseStreamChoice]

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"trsc-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal["transcription.chunk"] = (
    "transcription.chunk"
)

usage class-attribute instance-attribute

usage: Optional[UsageInfo] = Field(default=None)

TranscriptionWord

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionWord(OpenAIBaseModel):
    end: float
    """End time of the word in seconds."""

    start: float
    """Start time of the word in seconds."""

    word: str
    """The text content of the word."""

end instance-attribute

end: float

End time of the word in seconds.

start instance-attribute

start: float

Start time of the word in seconds.

word instance-attribute

word: str

The text content of the word.

TranslationRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranslationRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/audio/createTranslation

    file: UploadFile
    """
    The audio file object (not file name) to translate, in one of these
    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    """

    model: Optional[str] = None
    """ID of the model to use.
    """

    prompt: str = Field(default="")
    """An optional text to guide the model's style or continue a previous audio
    segment.

    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
    should match the audio language.
    """

    response_format: AudioResponseFormat = Field(default="json")
    """
    The format of the output, in one of these options: `json`, `text`, `srt`,
    `verbose_json`, or `vtt`.
    """

    # TODO support additional sampling parameters
    # --8<-- [start:translation-sampling-params]
    temperature: float = Field(default=0.0)
    """The sampling temperature, between 0 and 1.

    Higher values like 0.8 will make the output more random, while lower values
    like 0.2 will make it more focused / deterministic. If set to 0, the model
    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
    to automatically increase the temperature until certain thresholds are hit.
    """
    # --8<-- [end:translation-sampling-params]

    # --8<-- [start:translation-extra-params]
    language: Optional[str] = None
    """The language of the input audio we translate from.

    Supplying the input language in
    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
    will improve accuracy.
    """

    stream: Optional[bool] = False
    """Custom field not present in the original OpenAI definition. When set, 
    it will enable output to be streamed in a similar fashion as the Chat
    Completion endpoint. 
    """
    # Flattened stream option to simplify form data.
    stream_include_usage: Optional[bool] = False
    stream_continuous_usage_stats: Optional[bool] = False
    # --8<-- [end:translation-extra-params]

    # Default sampling parameters for translation requests.
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "temperature": 0,
    }

    def to_sampling_params(
            self,
            default_max_tokens: int,
            default_sampling_params: Optional[dict] = None) -> SamplingParams:

        max_tokens = default_max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}
        # Default parameters
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])

        return SamplingParams.from_optional(temperature=temperature,
                                            max_tokens=max_tokens,
                                            output_kind=RequestOutputKind.DELTA
                                            if self.stream \
                                            else RequestOutputKind.FINAL_ONLY)

    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):
        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
        stream = data.get("stream", False)
        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
            raise ValueError(
                "Stream options can only be defined when `stream=True`.")

        return data

_DEFAULT_SAMPLING_PARAMS class-attribute instance-attribute

_DEFAULT_SAMPLING_PARAMS: dict = {'temperature': 0}

file instance-attribute

file: UploadFile

The audio file object (not file name) to translate, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.

language class-attribute instance-attribute

language: Optional[str] = None

The language of the input audio we translate from.

Supplying the input language in ISO-639-1 format will improve accuracy.

model class-attribute instance-attribute

model: Optional[str] = None

ID of the model to use.

prompt class-attribute instance-attribute

prompt: str = Field(default='')

An optional text to guide the model's style or continue a previous audio segment.

The prompt should match the audio language.

response_format class-attribute instance-attribute

response_format: AudioResponseFormat = Field(default="json")

The format of the output, in one of these options: json, text, srt, verbose_json, or vtt.

stream class-attribute instance-attribute

stream: Optional[bool] = False

Custom field not present in the original OpenAI definition. When set, it will enable output to be streamed in a similar fashion to the Chat Completion endpoint.

stream_continuous_usage_stats class-attribute instance-attribute

stream_continuous_usage_stats: Optional[bool] = False

stream_include_usage class-attribute instance-attribute

stream_include_usage: Optional[bool] = False

temperature class-attribute instance-attribute

temperature: float = Field(default=0.0)

The sampling temperature, between 0 and 1.

Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused / deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.

to_sampling_params

to_sampling_params(
    default_max_tokens: int,
    default_sampling_params: Optional[dict] = None,
) -> SamplingParams
Source code in vllm/entrypoints/openai/protocol.py
def to_sampling_params(
        self,
        default_max_tokens: int,
        default_sampling_params: Optional[dict] = None) -> SamplingParams:

    max_tokens = default_max_tokens

    if default_sampling_params is None:
        default_sampling_params = {}
    # Default parameters
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])

    return SamplingParams.from_optional(temperature=temperature,
                                        max_tokens=max_tokens,
                                        output_kind=RequestOutputKind.DELTA
                                        if self.stream \
                                        else RequestOutputKind.FINAL_ONLY)

validate_stream_options classmethod

validate_stream_options(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def validate_stream_options(cls, data):
    stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
    stream = data.get("stream", False)
    if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
        raise ValueError(
            "Stream options can only be defined when `stream=True`.")

    return data

TranslationResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranslationResponse(OpenAIBaseModel):
    text: str
    """The translated text."""

text instance-attribute

text: str

The translated text.

TranslationResponseStreamChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranslationResponseStreamChoice(OpenAIBaseModel):
    delta: DeltaMessage
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = None

delta instance-attribute

delta: DeltaMessage

finish_reason class-attribute instance-attribute

finish_reason: Optional[str] = None

stop_reason class-attribute instance-attribute

stop_reason: Optional[Union[int, str]] = None

TranslationResponseVerbose

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranslationResponseVerbose(OpenAIBaseModel):
    duration: str
    """The duration of the input audio."""

    language: str
    """The language of the input audio."""

    text: str
    """The translated text."""

    segments: Optional[list[TranslationSegment]] = None
    """Segments of the translated text and their corresponding details."""

    words: Optional[list[TranslationWord]] = None
    """Extracted words and their corresponding timestamps."""

duration instance-attribute

duration: str

The duration of the input audio.

language instance-attribute

language: str

The language of the input audio.

segments class-attribute instance-attribute

segments: Optional[list[TranslationSegment]] = None

Segments of the translated text and their corresponding details.

text instance-attribute

text: str

The translated text.

words class-attribute instance-attribute

words: Optional[list[TranslationWord]] = None

Extracted words and their corresponding timestamps.

TranslationSegment

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranslationSegment(OpenAIBaseModel):
    id: int
    """Unique identifier of the segment."""

    avg_logprob: float
    """Average logprob of the segment.

    If the value is lower than -1, consider the logprobs failed.
    """

    compression_ratio: float
    """Compression ratio of the segment.

    If the value is greater than 2.4, consider the compression failed.
    """

    end: float
    """End time of the segment in seconds."""

    no_speech_prob: float
    """Probability of no speech in the segment.

    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
    this segment silent.
    """

    seek: int
    """Seek offset of the segment."""

    start: float
    """Start time of the segment in seconds."""

    temperature: float
    """Temperature parameter used for generating the segment."""

    text: str
    """Text content of the segment."""

    tokens: list[int]
    """Array of token IDs for the text content."""

avg_logprob instance-attribute

avg_logprob: float

Average logprob of the segment.

If the value is lower than -1, consider the logprobs failed.

compression_ratio instance-attribute

compression_ratio: float

Compression ratio of the segment.

If the value is greater than 2.4, consider the compression failed.

end instance-attribute

end: float

End time of the segment in seconds.

id instance-attribute

id: int

Unique identifier of the segment.

no_speech_prob instance-attribute

no_speech_prob: float

Probability of no speech in the segment.

If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent.

seek instance-attribute

seek: int

Seek offset of the segment.

start instance-attribute

start: float

Start time of the segment in seconds.

temperature instance-attribute

temperature: float

Temperature parameter used for generating the segment.

text instance-attribute

text: str

Text content of the segment.

tokens instance-attribute

tokens: list[int]

Array of token IDs for the text content.

TranslationStreamResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranslationStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}")
    object: Literal["translation.chunk"] = "translation.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[TranslationResponseStreamChoice]
    usage: Optional[UsageInfo] = Field(default=None)

choices instance-attribute

choices: list[TranslationResponseStreamChoice]

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"trsl-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal['translation.chunk'] = 'translation.chunk'

usage class-attribute instance-attribute

usage: Optional[UsageInfo] = Field(default=None)

TranslationWord

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranslationWord(OpenAIBaseModel):
    end: float
    """End time of the word in seconds."""

    start: float
    """Start time of the word in seconds."""

    word: str
    """The text content of the word."""

end instance-attribute

end: float

End time of the word in seconds.

start instance-attribute

start: float

Start time of the word in seconds.

word instance-attribute

word: str

The text content of the word.

UnloadLoRAAdapterRequest

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class UnloadLoRAAdapterRequest(BaseModel):
    lora_name: str
    lora_int_id: Optional[int] = Field(default=None)

lora_int_id class-attribute instance-attribute

lora_int_id: Optional[int] = Field(default=None)

lora_name instance-attribute

lora_name: str

UsageInfo

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class UsageInfo(OpenAIBaseModel):
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens: Optional[int] = 0
    prompt_tokens_details: Optional[PromptTokenUsageInfo] = None

completion_tokens class-attribute instance-attribute

completion_tokens: Optional[int] = 0

prompt_tokens class-attribute instance-attribute

prompt_tokens: int = 0

prompt_tokens_details class-attribute instance-attribute

prompt_tokens_details: Optional[PromptTokenUsageInfo] = None

total_tokens class-attribute instance-attribute

total_tokens: int = 0

get_logits_processors

get_logits_processors(
    processors: Optional[LogitsProcessors],
    pattern: Optional[str],
) -> Optional[list[Any]]
Source code in vllm/entrypoints/openai/protocol.py
def get_logits_processors(processors: Optional[LogitsProcessors],
                          pattern: Optional[str]) -> Optional[list[Any]]:
    if processors and pattern:
        logits_processors = []
        for processor in processors:
            qualname = processor if isinstance(processor,
                                               str) else processor.qualname
            if not re.match(pattern, qualname):
                raise ValueError(
                    f"Logits processor '{qualname}' is not allowed by this "
                    "server. See --logits-processor-pattern engine argument "
                    "for more information.")
            try:
                logits_processor = resolve_obj_by_qualname(qualname)
            except Exception as e:
                raise ValueError(
                    f"Logits processor '{qualname}' could not be resolved: {e}"
                ) from e
            if isinstance(processor, LogitsProcessorConstructor):
                logits_processor = logits_processor(*processor.args or [],
                                                    **processor.kwargs or {})
            logits_processors.append(logits_processor)
        return logits_processors
    elif processors:
        raise ValueError(
            "The `logits_processors` argument is not supported by this "
            "server. See --logits-processor-pattern engine argugment "
            "for more information.")
    return None
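
A sketch of the allow-list behaviour: when a --logits-processor-pattern is configured, every requested qualified name must match it before being resolved and instantiated; when no pattern is configured, requesting any logits processor is rejected. The qualname my_pkg.processors.LengthPenalty below is made up for illustration:

from vllm.entrypoints.openai.protocol import get_logits_processors

# Pattern configured, but the requested qualname does not match it.
try:
    get_logits_processors(processors=["my_pkg.processors.LengthPenalty"],
                          pattern=r"^approved_pkg\.")
except ValueError as err:
    print(err)  # ... is not allowed by this server ...

# No pattern configured: any request for logits processors is rejected.
try:
    get_logits_processors(processors=["my_pkg.processors.LengthPenalty"],
                          pattern=None)
except ValueError as err:
    print(err)  # The `logits_processors` argument is not supported ...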