Skip to content

vllm.v1.engine

Modules:

Name Description
async_llm
coordinator
core
core_client
detokenizer
exceptions
llm_engine
logprobs
mm_input_cache
output_processor
parallel_sampling
processor
utils

FINISH_REASON_STRINGS module-attribute

FINISH_REASON_STRINGS = ('stop', 'length', 'abort')

EngineCoreEvent

Bases: Struct

A timestamped engine core event associated with a request.

The timestamp is a monotonic timestamp and is used by the engine frontend to calculate intervals between engine core events. These timestamps should not be compared with timestamps from other processes.

Source code in vllm/v1/engine/__init__.py
class EngineCoreEvent(msgspec.Struct):
    """A timestamped engine core event associated with a request.

    The timestamp is a monotonic timestamp and is used by the engine
    frontend to calculate intervals between engine core events. These
    timestamps should not be compared with timestamps from other processes.
    """
    type: EngineCoreEventType
    timestamp: float

    @classmethod
    def new_event(cls,
                  event_type: EngineCoreEventType,
                  timestamp: Optional[float] = None) -> "EngineCoreEvent":
        # Stamp with the current monotonic clock unless the caller
        # supplied an explicit timestamp.
        timestamp = time.monotonic() if timestamp is None else timestamp
        return cls(event_type, timestamp)

timestamp instance-attribute

timestamp: float

type instance-attribute

new_event classmethod

new_event(
    event_type: EngineCoreEventType,
    timestamp: Optional[float] = None,
) -> EngineCoreEvent
Source code in vllm/v1/engine/__init__.py
@classmethod
def new_event(cls,
              event_type: EngineCoreEventType,
              timestamp: Optional[float] = None) -> "EngineCoreEvent":
    """Build an event, stamping the current monotonic clock if needed."""
    if timestamp is None:
        timestamp = time.monotonic()
    return cls(event_type, timestamp)

EngineCoreEventType

Bases: IntEnum

The type of engine core request event.

Source code in vllm/v1/engine/__init__.py
class EngineCoreEventType(enum.IntEnum):
    """The type of engine core request event."""

    QUEUED = enum.auto()     # 1
    SCHEDULED = enum.auto()  # 2
    PREEMPTED = enum.auto()  # 3

PREEMPTED class-attribute instance-attribute

PREEMPTED = 3

QUEUED class-attribute instance-attribute

QUEUED = 1

SCHEDULED class-attribute instance-attribute

SCHEDULED = 2

EngineCoreOutput

Bases: Struct

Source code in vllm/v1/engine/__init__.py
class EngineCoreOutput(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True,  # type: ignore[call-arg]
        gc=False):  # type: ignore[call-arg]
    """Per-step output for a single request, produced by the engine core.

    NOTE(review): array_like=True presumably serializes fields positionally,
    making field order part of the wire format — confirm before reordering
    fields here.
    """

    request_id: str
    new_token_ids: list[int]

    new_logprobs: Optional[LogprobsLists] = None
    new_prompt_logprobs_tensors: Optional[LogprobsTensors] = None

    pooling_output: Optional[torch.Tensor] = None

    # None while the request is still in progress (see `finished`).
    finish_reason: Optional[FinishReason] = None
    stop_reason: Union[int, str, None] = None
    events: Optional[list[EngineCoreEvent]] = None
    kv_transfer_params: Optional[dict[str, Any]] = None

    # The number of tokens with prefix cache hits.
    num_cached_tokens: int = 0

    @property
    def finished(self) -> bool:
        # A request is finished exactly when a finish reason has been set.
        return self.finish_reason is not None

events class-attribute instance-attribute

events: Optional[list[EngineCoreEvent]] = None

finish_reason class-attribute instance-attribute

finish_reason: Optional[FinishReason] = None

finished property

finished: bool

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: Optional[dict[str, Any]] = None

new_logprobs class-attribute instance-attribute

new_logprobs: Optional[LogprobsLists] = None

new_prompt_logprobs_tensors class-attribute instance-attribute

new_prompt_logprobs_tensors: Optional[LogprobsTensors] = (
    None
)

new_token_ids instance-attribute

new_token_ids: list[int]

num_cached_tokens class-attribute instance-attribute

num_cached_tokens: int = 0

pooling_output class-attribute instance-attribute

pooling_output: Optional[Tensor] = None

request_id instance-attribute

request_id: str

stop_reason class-attribute instance-attribute

stop_reason: Union[int, str, None] = None

EngineCoreOutputs

Bases: Struct

Source code in vllm/v1/engine/__init__.py
class EngineCoreOutputs(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True,  # type: ignore[call-arg]
        gc=False):  # type: ignore[call-arg]
    """A batch of engine core outputs sent to the frontend in one message.

    NOTE(review): array_like=True presumably serializes fields positionally,
    making field order part of the wire format — confirm before reordering
    fields here.
    """

    #NOTE(Nick): We could consider ways to make this more compact,
    # e.g. columnwise layout

    engine_index: int = 0

    # [num_reqs]
    # NOTE(review): mutable [] default — assumed safe because msgspec copies
    # mutable defaults per instance (unlike plain class attributes); confirm.
    outputs: list[EngineCoreOutput] = []
    scheduler_stats: Optional[SchedulerStats] = None
    # Monotonic clock value; 0.0 is a sentinel replaced in __post_init__.
    timestamp: float = 0.0

    utility_output: Optional[UtilityOutput] = None
    finished_requests: Optional[set[str]] = None

    # In DP case, used to signal that the current wave of requests
    # has finished and the engines are paused.
    wave_complete: Optional[int] = None
    # In DP case, used to signal that a request was received for an
    # "old" wave, so the next wave needs to be started in other engines.
    start_wave: Optional[int] = None

    def __post_init__(self):
        # Only stamp when unset, so locally-constructed messages get the
        # current time while deserialized ones keep the sender's timestamp.
        if self.timestamp == 0.0:
            self.timestamp = time.monotonic()

engine_index class-attribute instance-attribute

engine_index: int = 0

finished_requests class-attribute instance-attribute

finished_requests: Optional[set[str]] = None

outputs class-attribute instance-attribute

outputs: list[EngineCoreOutput] = []

scheduler_stats class-attribute instance-attribute

scheduler_stats: Optional[SchedulerStats] = None

start_wave class-attribute instance-attribute

start_wave: Optional[int] = None

timestamp class-attribute instance-attribute

timestamp: float = 0.0

utility_output class-attribute instance-attribute

utility_output: Optional[UtilityOutput] = None

wave_complete class-attribute instance-attribute

wave_complete: Optional[int] = None

__post_init__

__post_init__()
Source code in vllm/v1/engine/__init__.py
def __post_init__(self):
    # Only stamp when unset (0.0 is the sentinel), so deserialized
    # messages keep the sender's monotonic timestamp.
    if self.timestamp == 0.0:
        self.timestamp = time.monotonic()

EngineCoreRequest

Bases: Struct

Source code in vllm/v1/engine/__init__.py
class EngineCoreRequest(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True,  # type: ignore[call-arg]
        gc=False):  # type: ignore[call-arg]
    """A single request as submitted to the engine core.

    NOTE(review): array_like=True presumably serializes fields positionally,
    making field order part of the wire format — confirm before reordering
    fields here.
    """

    request_id: str
    prompt_token_ids: list[int]
    # Multimodal inputs, hashes and placeholders; presumably None for
    # text-only requests (inferred from the Optional types — confirm).
    mm_inputs: Optional[Sequence[Optional[MultiModalKwargs]]]
    mm_hashes: Optional[list[str]]
    mm_placeholders: Optional[list[PlaceholderRange]]
    sampling_params: Optional[SamplingParams]
    pooling_params: Optional[PoolingParams]
    eos_token_id: Optional[int]
    arrival_time: float
    lora_request: Optional[LoRARequest]
    cache_salt: Optional[str]
    data_parallel_rank: Optional[int]

    # Index of the client, used to ensure outputs are sent back to the same
    # client for this request when scaling out the front-end.
    client_index: int = 0

    # Used in DP case to indicate which wave of requests this is expected to
    # belong to, to cover a race condition where the request is sent before
    # a wave finished notification is received.
    current_wave: int = 0
    priority: int = 0

arrival_time instance-attribute

arrival_time: float

cache_salt instance-attribute

cache_salt: Optional[str]

client_index class-attribute instance-attribute

client_index: int = 0

current_wave class-attribute instance-attribute

current_wave: int = 0

data_parallel_rank instance-attribute

data_parallel_rank: Optional[int]

eos_token_id instance-attribute

eos_token_id: Optional[int]

lora_request instance-attribute

lora_request: Optional[LoRARequest]

mm_hashes instance-attribute

mm_hashes: Optional[list[str]]

mm_inputs instance-attribute

mm_placeholders instance-attribute

mm_placeholders: Optional[list[PlaceholderRange]]

pooling_params instance-attribute

pooling_params: Optional[PoolingParams]

priority class-attribute instance-attribute

priority: int = 0

prompt_token_ids instance-attribute

prompt_token_ids: list[int]

request_id instance-attribute

request_id: str

sampling_params instance-attribute

sampling_params: Optional[SamplingParams]

EngineCoreRequestType

Bases: Enum

Request types defined as hex byte strings, so they can be sent over sockets without a separate encoding step.

Source code in vllm/v1/engine/__init__.py
class EngineCoreRequestType(enum.Enum):
    """
    Request types defined as hex byte strings, so they can be sent over
    sockets without a separate encoding step.
    """
    ADD = b'\x00'
    ABORT = b'\x01'
    START_DP_WAVE = b'\x02'
    UTILITY = b'\x03'
    # Sentinel used within EngineCoreProc.
    EXECUTOR_FAILED = b'\x04'

ABORT class-attribute instance-attribute

ABORT = b'\x01'

ADD class-attribute instance-attribute

ADD = b'\x00'

EXECUTOR_FAILED class-attribute instance-attribute

EXECUTOR_FAILED = b'\x04'

START_DP_WAVE class-attribute instance-attribute

START_DP_WAVE = b'\x02'

UTILITY class-attribute instance-attribute

UTILITY = b'\x03'

FinishReason

Bases: IntEnum

Reason a request finished - stop, length, or abort.

Int rather than Str for more compact serialization.

stop — a stop string was emitted; length — max_tokens was consumed, or max_model_len was reached; abort — aborted for another reason.

Source code in vllm/v1/engine/__init__.py
class FinishReason(enum.IntEnum):
    """Why a request finished: STOP, LENGTH, or ABORT.

    Encoded as an int (rather than a string) for more compact serialization.

    STOP   - a stop string was emitted
    LENGTH - max_tokens was consumed, or max_model_len was reached
    ABORT  - aborted for another reason
    """

    STOP = 0
    LENGTH = 1
    ABORT = 2

    def __str__(self):
        # IntEnum members convert to their underlying int, so int(self)
        # indexes the same slot as self.value.
        return FINISH_REASON_STRINGS[int(self)]

ABORT class-attribute instance-attribute

ABORT = 2

LENGTH class-attribute instance-attribute

LENGTH = 1

STOP class-attribute instance-attribute

STOP = 0

__str__

__str__()
Source code in vllm/v1/engine/__init__.py
def __str__(self):
    # Map the enum's int value to its canonical lowercase string form.
    return FINISH_REASON_STRINGS[self.value]

UtilityOutput

Bases: Struct

Source code in vllm/v1/engine/__init__.py
class UtilityOutput(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        gc=False):  # type: ignore[call-arg]
    """Outcome of a utility call: either a result or a failure message.

    NOTE(review): array_like=True presumably serializes fields positionally,
    making field order part of the wire format — confirm before reordering.
    """

    # Identifier correlating this output with its originating call.
    call_id: int

    # Non-None implies the call failed, result should be None.
    failure_message: Optional[str] = None
    result: Any = None

call_id instance-attribute

call_id: int

failure_message class-attribute instance-attribute

failure_message: Optional[str] = None

result class-attribute instance-attribute

result: Any = None