vllm.v1.outputs

EMPTY_MODEL_RUNNER_OUTPUT module-attribute

EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
    req_ids=[],
    req_id_to_index={},
    sampled_token_ids=[],
    spec_token_ids=None,
    logprobs=None,
    prompt_logprobs_dict={},
    pooler_output=[],
    finished_sending=None,
    finished_recving=None,
    num_nans_in_logits=None,
)
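Since ModelRunnerOutput is a dataclass, a populated output can be derived from this empty sentinel with dataclasses.replace. A minimal sketch, assuming vllm is importable; the request id "req-0" and the token value are hypothetical:

from dataclasses import replace

from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT

# Derive a populated output from the empty sentinel.
# Note: replace() is shallow; fields not overridden here still alias
# the sentinel's objects, so avoid mutating them in place.
output = replace(
    EMPTY_MODEL_RUNNER_OUTPUT,
    req_ids=["req-0"],
    req_id_to_index={"req-0": 0},
    sampled_token_ids=[[42]],
)
assert output.req_ids == ["req-0"]
assert EMPTY_MODEL_RUNNER_OUTPUT.req_ids == []  # the sentinel itself is unchanged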

LogprobsLists

Bases: NamedTuple

Source code in vllm/v1/outputs.py
class LogprobsLists(NamedTuple):

    # [num_reqs, max_num_logprobs + 1]
    logprob_token_ids: list[list[int]]
    # [num_reqs, max_num_logprobs + 1]
    logprobs: list[list[float]]
    # [num_reqs]
    sampled_token_ranks: list[int]

    def slice(self, start: int, end: int):
        return LogprobsLists(
            self.logprob_token_ids[start:end],
            self.logprobs[start:end],
            self.sampled_token_ranks[start:end],
        )

logprob_token_ids instance-attribute

logprob_token_ids: list[list[int]]

logprobs instance-attribute

logprobs: list[list[float]]

sampled_token_ranks instance-attribute

sampled_token_ranks: list[int]

slice

slice(start: int, end: int)
Source code in vllm/v1/outputs.py
def slice(self, start: int, end: int):
    return LogprobsLists(
        self.logprob_token_ids[start:end],
        self.logprobs[start:end],
        self.sampled_token_ranks[start:end],
    )
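A minimal usage sketch of slice, with hand-made values: three requests, one logprob entry per row (i.e. max_num_logprobs = 0):

from vllm.v1.outputs import LogprobsLists

lists = LogprobsLists(
    logprob_token_ids=[[11], [22], [33]],
    logprobs=[[-0.1], [-0.5], [-0.9]],
    sampled_token_ranks=[0, 0, 1],
)

# slice() narrows all three parallel lists to the same request range.
middle = lists.slice(1, 3)
assert middle.logprob_token_ids == [[22], [33]]
assert middle.sampled_token_ranks == [0, 1]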

LogprobsTensors

Bases: NamedTuple

Source code in vllm/v1/outputs.py
class LogprobsTensors(NamedTuple):

    # [num_reqs, max_num_logprobs + 1]
    logprob_token_ids: torch.Tensor
    # [num_reqs, max_num_logprobs + 1]
    logprobs: torch.Tensor
    # [num_reqs]
    selected_token_ranks: torch.Tensor

    def tolists(self):
        return LogprobsLists(
            self.logprob_token_ids.tolist(),
            self.logprobs.tolist(),
            self.selected_token_ranks.tolist(),
        )

    @staticmethod
    def empty_cpu(num_positions: int,
                  num_tokens_per_position: int) -> "LogprobsTensors":
        """Create empty LogprobsTensors on CPU."""

        logprob_token_ids = torch.empty(
            (num_positions, num_tokens_per_position),
            dtype=torch.int32,
            device="cpu")
        logprobs = torch.empty_like(logprob_token_ids, dtype=torch.float32)
        selected_token_ranks = torch.empty(num_positions,
                                           dtype=torch.int32,
                                           device="cpu")
        return LogprobsTensors(
            logprob_token_ids=logprob_token_ids,
            logprobs=logprobs,
            selected_token_ranks=selected_token_ranks,
        )

logprob_token_ids instance-attribute

logprob_token_ids: Tensor

logprobs instance-attribute

logprobs: Tensor

selected_token_ranks instance-attribute

selected_token_ranks: Tensor

empty_cpu staticmethod

empty_cpu(
    num_positions: int, num_tokens_per_position: int
) -> LogprobsTensors

Create empty LogprobsTensors on CPU.

Source code in vllm/v1/outputs.py
@staticmethod
def empty_cpu(num_positions: int,
              num_tokens_per_position: int) -> "LogprobsTensors":
    """Create empty LogprobsTensors on CPU."""

    logprob_token_ids = torch.empty(
        (num_positions, num_tokens_per_position),
        dtype=torch.int32,
        device="cpu")
    logprobs = torch.empty_like(logprob_token_ids, dtype=torch.float32)
    selected_token_ranks = torch.empty(num_positions,
                                       dtype=torch.int32,
                                       device="cpu")
    return LogprobsTensors(
        logprob_token_ids=logprob_token_ids,
        logprobs=logprobs,
        selected_token_ranks=selected_token_ranks,
    )
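An illustrative call, assuming vllm is importable; the sizes are arbitrary (4 positions, 3 entries per position, e.g. max_num_logprobs = 2 plus the sampled token). Since the buffers come from torch.empty, their contents are uninitialized:

import torch

from vllm.v1.outputs import LogprobsTensors

buf = LogprobsTensors.empty_cpu(num_positions=4, num_tokens_per_position=3)

assert buf.logprob_token_ids.shape == (4, 3)
assert buf.logprob_token_ids.dtype == torch.int32
assert buf.logprobs.shape == (4, 3)
assert buf.selected_token_ranks.shape == (4,)
assert buf.logprobs.device.type == "cpu"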

tolists

tolists()
Source code in vllm/v1/outputs.py
def tolists(self):
    return LogprobsLists(
        self.logprob_token_ids.tolist(),
        self.logprobs.tolist(),
        self.selected_token_ranks.tolist(),
    )
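A small round-trip sketch with illustrative tensors for a single request. Note that tolists maps selected_token_ranks onto the sampled_token_ranks field of the resulting LogprobsLists:

import torch

from vllm.v1.outputs import LogprobsLists, LogprobsTensors

tensors = LogprobsTensors(
    logprob_token_ids=torch.tensor([[7, 9]], dtype=torch.int32),
    logprobs=torch.tensor([[-0.2, -1.3]], dtype=torch.float32),
    selected_token_ranks=torch.tensor([0], dtype=torch.int32),
)

lists = tensors.tolists()
assert isinstance(lists, LogprobsLists)
assert lists.logprob_token_ids == [[7, 9]]
assert lists.sampled_token_ranks == [0]
assert abs(lists.logprobs[0][0] - (-0.2)) < 1e-6  # float32 -> Python float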

ModelRunnerOutput dataclass

Source code in vllm/v1/outputs.py
@dataclass
class ModelRunnerOutput:

    # [num_reqs]
    req_ids: list[str]
    # req_id -> index
    req_id_to_index: dict[str, int]

    # num_reqs x num_generated_tokens
    # num_generated_tokens is the number of tokens
    # generated in the current step. It can be different for
    # each request due to speculative/jump decoding.
    sampled_token_ids: list[list[int]]

    # num_reqs x num_spec_tokens
    spec_token_ids: Optional[list[list[int]]]

    # [num_reqs, max_num_logprobs + 1]
    # [num_reqs, max_num_logprobs + 1]
    # [num_reqs]
    logprobs: Optional[LogprobsLists]

    # req_id -> (token_ids, logprobs, ranks)
    # [prompt_len, num_prompt_logprobs]
    # [prompt_len, num_prompt_logprobs]
    # [prompt_len]
    prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]]

    # [num_reqs, hidden_size]
    pooler_output: list[Optional[torch.Tensor]]

    # [req_ids]
    finished_sending: Optional[set[str]] = None
    finished_recving: Optional[set[str]] = None

    # req_id -> num_nans_in_logits
    num_nans_in_logits: Optional[dict[str, int]] = None

finished_recving class-attribute instance-attribute

finished_recving: Optional[set[str]] = None

finished_sending class-attribute instance-attribute

finished_sending: Optional[set[str]] = None

logprobs instance-attribute

logprobs: Optional[LogprobsLists]

num_nans_in_logits class-attribute instance-attribute

num_nans_in_logits: Optional[dict[str, int]] = None

pooler_output instance-attribute

pooler_output: list[Optional[Tensor]]

prompt_logprobs_dict instance-attribute

prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]]

req_id_to_index instance-attribute

req_id_to_index: dict[str, int]

req_ids instance-attribute

req_ids: list[str]

sampled_token_ids instance-attribute

sampled_token_ids: list[list[int]]

spec_token_ids instance-attribute

spec_token_ids: Optional[list[list[int]]]

__init__

__init__(
    req_ids: list[str],
    req_id_to_index: dict[str, int],
    sampled_token_ids: list[list[int]],
    spec_token_ids: Optional[list[list[int]]],
    logprobs: Optional[LogprobsLists],
    prompt_logprobs_dict: dict[
        str, Optional[LogprobsTensors]
    ],
    pooler_output: list[Optional[Tensor]],
    finished_sending: Optional[set[str]] = None,
    finished_recving: Optional[set[str]] = None,
    num_nans_in_logits: Optional[dict[str, int]] = None,
) -> None
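A hand-filled example for a single decode step with one generation request; all values are illustrative, not taken from a real run:

from vllm.v1.outputs import ModelRunnerOutput

output = ModelRunnerOutput(
    req_ids=["req-0"],
    req_id_to_index={"req-0": 0},
    sampled_token_ids=[[1234]],           # one token generated this step
    spec_token_ids=None,                  # no speculative decoding
    logprobs=None,                        # logprobs were not requested
    prompt_logprobs_dict={"req-0": None},
    pooler_output=[None],                 # generation request, no pooling
)

# The trailing optional fields default to None.
assert output.finished_sending is None
assert output.num_nans_in_logits is None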

SamplerOutput dataclass

Source code in vllm/v1/outputs.py
@dataclass
class SamplerOutput:

    # [num_reqs, max_num_generated_tokens]
    # Different requests can have different number of generated tokens.
    # All requests are padded to max_num_generated_tokens.
    # PLACEHOLDER_TOKEN_ID (-1 by default) is used for padding.
    sampled_token_ids: torch.Tensor
    logprobs_tensors: Optional[LogprobsTensors]

logprobs_tensors instance-attribute

logprobs_tensors: Optional[LogprobsTensors]

sampled_token_ids instance-attribute

sampled_token_ids: Tensor

__init__

__init__(
    sampled_token_ids: Tensor,
    logprobs_tensors: Optional[LogprobsTensors],
) -> None
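An illustrative sketch of the padding convention described in the source comments: request 0 produced two tokens this step, request 1 produced one, so its row is padded with -1 (the default PLACEHOLDER_TOKEN_ID):

import torch

from vllm.v1.outputs import SamplerOutput

sampled = torch.tensor([[101, 102],
                        [201,  -1]])

out = SamplerOutput(sampled_token_ids=sampled, logprobs_tensors=None)

# Strip the padding per request before handing tokens downstream.
valid = [[t for t in row if t != -1]
         for row in out.sampled_token_ids.tolist()]
assert valid == [[101, 102], [201]]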