vllm.v1.core.sched.output

CachedRequestData dataclass

Source code in vllm/v1/core/sched/output.py
@dataclass
class CachedRequestData:

    req_ids: list[str]
    # If resumed_from_preemption is False, new_block_ids will be appended to
    # the request's block IDs. If True, new_block_ids will be used as the
    # request's block IDs instead of appending to the existing block IDs.
    resumed_from_preemption: list[bool]
    # NOTE(woosuk): new_token_ids is only used for pipeline parallelism.
    # When PP is not used, new_token_ids will be empty.
    new_token_ids: list[list[int]]
    new_block_ids: list[tuple[list[int], ...]]
    num_computed_tokens: list[int]

    @property
    def num_reqs(self) -> int:
        return len(self.req_ids)

    @classmethod
    def make_empty(cls) -> CachedRequestData:
        return cls(
            req_ids=[],
            resumed_from_preemption=[],
            new_token_ids=[],
            new_block_ids=[],
            num_computed_tokens=[],
        )
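
The snippet below is a minimal sketch of how a worker-side cache might apply this diff; apply_new_block_ids and worker_block_ids are illustrative names, not part of vLLM's API. It follows the merge rule described in the comment on resumed_from_preemption above.

def apply_new_block_ids(
    worker_block_ids: dict[str, tuple[list[int], ...]],
    req_id: str,
    resumed_from_preemption: bool,
    new_block_ids: tuple[list[int], ...],
) -> None:
    if resumed_from_preemption:
        # Preempted request: the new IDs replace the old block IDs.
        worker_block_ids[req_id] = tuple(list(ids) for ids in new_block_ids)
    else:
        # Running request: the new IDs are appended to the existing
        # ones, one list per KV cache group.
        for old, new in zip(worker_block_ids[req_id], new_block_ids):
            old.extend(new)

cache = {"req-0": ([1, 2],)}
apply_new_block_ids(cache, "req-0", False, ([3],))
assert cache["req-0"] == ([1, 2, 3],)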

new_block_ids instance-attribute

new_block_ids: list[tuple[list[int], ...]]

new_token_ids instance-attribute

new_token_ids: list[list[int]]

num_computed_tokens instance-attribute

num_computed_tokens: list[int]

num_reqs property

num_reqs: int

req_ids instance-attribute

req_ids: list[str]

resumed_from_preemption instance-attribute

resumed_from_preemption: list[bool]

__init__

__init__(
    req_ids: list[str],
    resumed_from_preemption: list[bool],
    new_token_ids: list[list[int]],
    new_block_ids: list[tuple[list[int], ...]],
    num_computed_tokens: list[int],
) -> None

make_empty classmethod

make_empty() -> CachedRequestData
Source code in vllm/v1/core/sched/output.py
@classmethod
def make_empty(cls) -> CachedRequestData:
    return cls(
        req_ids=[],
        resumed_from_preemption=[],
        new_token_ids=[],
        new_block_ids=[],
        num_computed_tokens=[],
    )
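
A short usage sketch: an empty diff is what the scheduler sends when no previously-scheduled request needs an update in the current step.

empty = CachedRequestData.make_empty()
assert empty.num_reqs == 0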

NewRequestData dataclass

Source code in vllm/v1/core/sched/output.py
@dataclass
class NewRequestData:

    req_id: str
    prompt_token_ids: list[int]
    mm_inputs: list[MultiModalKwargs]
    mm_hashes: list[str]
    mm_positions: list[PlaceholderRange]
    sampling_params: Optional[SamplingParams]
    pooling_params: Optional[PoolingParams]
    block_ids: tuple[list[int], ...]
    num_computed_tokens: int
    lora_request: Optional[LoRARequest]

    @classmethod
    def from_request(
        cls,
        request: Request,
        block_ids: tuple[list[int], ...],
    ) -> NewRequestData:
        return cls(
            req_id=request.request_id,
            prompt_token_ids=request.prompt_token_ids,
            mm_inputs=request.mm_inputs,
            mm_hashes=request.mm_hashes,
            mm_positions=request.mm_positions,
            sampling_params=request.sampling_params,
            pooling_params=request.pooling_params,
            block_ids=block_ids,
            num_computed_tokens=request.num_computed_tokens,
            lora_request=request.lora_request,
        )

    def __repr__(self):
        return (f"NewRequestData("
                f"req_id={self.req_id},"
                f"prompt_token_ids={self.prompt_token_ids},"
                f"mm_inputs={self.mm_inputs},"
                f"mm_hashes={self.mm_hashes},"
                f"mm_positions={self.mm_positions},"
                f"sampling_params={self.sampling_params},"
                f"block_ids={self.block_ids},"
                f"num_computed_tokens={self.num_computed_tokens},"
                f"lora_request={self.lora_request}"
                ")")

    # Version of __repr__ with the prompt data obfuscated
    def anon_repr(self):
        return (f"NewRequestData("
                f"req_id={self.req_id},"
                f"prompt_token_ids_len={len(self.prompt_token_ids)},"
                f"mm_inputs={self.mm_inputs},"
                f"mm_hashes={self.mm_hashes},"
                f"mm_positions={self.mm_positions},"
                f"sampling_params={self.sampling_params},"
                f"block_ids={self.block_ids},"
                f"num_computed_tokens={self.num_computed_tokens},"
                f"lora_request={self.lora_request}"
                ")")

block_ids instance-attribute

block_ids: tuple[list[int], ...]

lora_request instance-attribute

lora_request: Optional[LoRARequest]

mm_hashes instance-attribute

mm_hashes: list[str]

mm_inputs instance-attribute

mm_inputs: list[MultiModalKwargs]

mm_positions instance-attribute

mm_positions: list[PlaceholderRange]

num_computed_tokens instance-attribute

num_computed_tokens: int

pooling_params instance-attribute

pooling_params: Optional[PoolingParams]

prompt_token_ids instance-attribute

prompt_token_ids: list[int]

req_id instance-attribute

req_id: str

sampling_params instance-attribute

sampling_params: Optional[SamplingParams]

__init__

__init__(
    req_id: str,
    prompt_token_ids: list[int],
    mm_inputs: list[MultiModalKwargs],
    mm_hashes: list[str],
    mm_positions: list[PlaceholderRange],
    sampling_params: Optional[SamplingParams],
    pooling_params: Optional[PoolingParams],
    block_ids: tuple[list[int], ...],
    num_computed_tokens: int,
    lora_request: Optional[LoRARequest],
) -> None

__repr__

__repr__()
Source code in vllm/v1/core/sched/output.py
def __repr__(self):
    return (f"NewRequestData("
            f"req_id={self.req_id},"
            f"prompt_token_ids={self.prompt_token_ids},"
            f"mm_inputs={self.mm_inputs},"
            f"mm_hashes={self.mm_hashes},"
            f"mm_positions={self.mm_positions},"
            f"sampling_params={self.sampling_params},"
            f"block_ids={self.block_ids},"
            f"num_computed_tokens={self.num_computed_tokens},"
            f"lora_request={self.lora_request}"
            ")")

anon_repr

anon_repr()
Source code in vllm/v1/core/sched/output.py
def anon_repr(self):
    return (f"NewRequestData("
            f"req_id={self.req_id},"
            f"prompt_token_ids_len={len(self.prompt_token_ids)},"
            f"mm_inputs={self.mm_inputs},"
            f"mm_hashes={self.mm_hashes},"
            f"mm_positions={self.mm_positions},"
            f"sampling_params={self.sampling_params},"
            f"block_ids={self.block_ids},"
            f"num_computed_tokens={self.num_computed_tokens},"
            f"lora_request={self.lora_request}"
            ")")

from_request classmethod

from_request(
    request: Request, block_ids: tuple[list[int], ...]
) -> NewRequestData
Source code in vllm/v1/core/sched/output.py
@classmethod
def from_request(
    cls,
    request: Request,
    block_ids: tuple[list[int], ...],
) -> NewRequestData:
    return cls(
        req_id=request.request_id,
        prompt_token_ids=request.prompt_token_ids,
        mm_inputs=request.mm_inputs,
        mm_hashes=request.mm_hashes,
        mm_positions=request.mm_positions,
        sampling_params=request.sampling_params,
        pooling_params=request.pooling_params,
        block_ids=block_ids,
        num_computed_tokens=request.num_computed_tokens,
        lora_request=request.lora_request,
    )

SchedulerOutput dataclass

Source code in vllm/v1/core/sched/output.py
@dataclass
class SchedulerOutput:

    # list of the requests that are scheduled for the first time.
    # We cache the request's data in each worker process, so that we don't
    # need to re-send it every scheduling step.
    scheduled_new_reqs: list[NewRequestData]
    # list of the requests that have been scheduled before.
    # Since the request's data is already cached in the worker processes,
    # we only send the diff to minimize the communication cost.
    scheduled_cached_reqs: CachedRequestData

    # req_id -> num_scheduled_tokens
    # Number of tokens scheduled for each request.
    num_scheduled_tokens: dict[str, int]
    # Total number of tokens scheduled for all requests.
    # Equal to sum(num_scheduled_tokens.values())
    total_num_scheduled_tokens: int
    # req_id -> spec_token_ids
    # If a request does not have any spec decode tokens, it will not be
    # included in the dictionary.
    scheduled_spec_decode_tokens: dict[str, list[int]]
    # req_id -> encoder input indices that need processing.
    # E.g., if a request has [0, 1], it could mean the vision encoder needs
    # to process that request's 0-th and 1-st images in the current step.
    scheduled_encoder_inputs: dict[str, list[int]]
    # Number of common prefix blocks for all requests in each KV cache group.
    # This can be used for cascade attention.
    num_common_prefix_blocks: list[int]

    # Request IDs that are finished in between the previous and the current
    # steps. This is used to notify the workers about the finished requests
    # so that they can free the cached states for those requests.
    finished_req_ids: set[str]
    # list of (req_id, encoder_input_index) tuples.
    # Used to free the encoder cache.
    free_encoder_input_ids: list[tuple[str, int]]

    # Dict of request ids to their index within the batch
    # for filling the next token bitmask
    structured_output_request_ids: dict[str, int]
    # the bitmask for the whole batch
    grammar_bitmask: Optional[npt.NDArray[np.int32]]

    # KV Cache Connector metadata.
    kv_connector_metadata: Optional[KVConnectorMetadata] = None
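
A minimal worker-side sketch of consuming a SchedulerOutput; process_scheduler_output and the elided handlers are hypothetical, not vLLM's worker API. It makes the documented invariant explicit: total_num_scheduled_tokens equals the sum of num_scheduled_tokens.

from vllm.v1.core.sched.output import SchedulerOutput

def process_scheduler_output(output: SchedulerOutput) -> None:
    # Documented invariant on the token counts.
    assert output.total_num_scheduled_tokens == sum(
        output.num_scheduled_tokens.values())
    for new_req in output.scheduled_new_reqs:
        ...  # first time seen: cache the full request data
    cached = output.scheduled_cached_reqs
    for i, req_id in enumerate(cached.req_ids):
        ...  # already cached: apply the diff at index i
    for req_id in output.finished_req_ids:
        ...  # free cached state for finished requests
    for req_id, input_idx in output.free_encoder_input_ids:
        ...  # free the corresponding encoder cache entry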

finished_req_ids instance-attribute

finished_req_ids: set[str]

free_encoder_input_ids instance-attribute

free_encoder_input_ids: list[tuple[str, int]]

grammar_bitmask instance-attribute

grammar_bitmask: Optional[NDArray[int32]]

kv_connector_metadata class-attribute instance-attribute

kv_connector_metadata: Optional[KVConnectorMetadata] = None

num_common_prefix_blocks instance-attribute

num_common_prefix_blocks: list[int]

num_scheduled_tokens instance-attribute

num_scheduled_tokens: dict[str, int]

scheduled_cached_reqs instance-attribute

scheduled_cached_reqs: CachedRequestData

scheduled_encoder_inputs instance-attribute

scheduled_encoder_inputs: dict[str, list[int]]

scheduled_new_reqs instance-attribute

scheduled_new_reqs: list[NewRequestData]

scheduled_spec_decode_tokens instance-attribute

scheduled_spec_decode_tokens: dict[str, list[int]]

structured_output_request_ids instance-attribute

structured_output_request_ids: dict[str, int]
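
A small sketch, assuming the row layout described by the field comments above: structured_output_request_ids gives the row of grammar_bitmask that belongs to a given request. bitmask_row_for is an illustrative helper, not part of vLLM's API.

import numpy as np

from vllm.v1.core.sched.output import SchedulerOutput

def bitmask_row_for(output: SchedulerOutput, req_id: str) -> np.ndarray:
    assert output.grammar_bitmask is not None
    row = output.structured_output_request_ids[req_id]
    return output.grammar_bitmask[row]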

total_num_scheduled_tokens instance-attribute

total_num_scheduled_tokens: int

__init__

__init__(
    scheduled_new_reqs: list[NewRequestData],
    scheduled_cached_reqs: CachedRequestData,
    num_scheduled_tokens: dict[str, int],
    total_num_scheduled_tokens: int,
    scheduled_spec_decode_tokens: dict[str, list[int]],
    scheduled_encoder_inputs: dict[str, list[int]],
    num_common_prefix_blocks: list[int],
    finished_req_ids: set[str],
    free_encoder_input_ids: list[tuple[str, int]],
    structured_output_request_ids: dict[str, int],
    grammar_bitmask: Optional[NDArray[int32]],
    kv_connector_metadata: Optional[
        KVConnectorMetadata
    ] = None,
) -> None