vllm.worker.model_runner_base

T module-attribute

T = TypeVar('T', bound='BroadcastableModelInput')

logger module-attribute

logger = init_logger(__name__)

BroadcastableModelInput

Bases: ABC

Source code in vllm/worker/model_runner_base.py
class BroadcastableModelInput(ABC):

    @abstractmethod
    def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
        """
        Extract broadcastable fields. Override for fields that require some
        custom deserialization.
        """
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def from_broadcasted_tensor_dict(
        cls: Type[T],
        tensor_dict: Dict[str, Any],
        attn_backend: Optional["AttentionBackend"] = None,
    ) -> T:
        """
        Pop fields from the given tensor_dict and populate a new instance of
        BroadcastableModelInput.
        """
        raise NotImplementedError
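
A minimal sketch of a concrete subclass, shown here for orientation only. ToyModelInput and its fields are hypothetical and not part of vllm; the pop-based deserialization mirrors the contract described in the docstrings above.

import dataclasses
from typing import Any, Dict, Optional

import torch

from vllm.worker.model_runner_base import BroadcastableModelInput


@dataclasses.dataclass(frozen=True)
class ToyModelInput(BroadcastableModelInput):
    # Hypothetical fields for illustration only.
    input_tokens: Optional[torch.Tensor] = None
    input_positions: Optional[torch.Tensor] = None

    def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
        # Flatten into plain entries that the broadcast layer can ship.
        return {
            "input_tokens": self.input_tokens,
            "input_positions": self.input_positions,
        }

    @classmethod
    def from_broadcasted_tensor_dict(cls, tensor_dict, attn_backend=None):
        # Pop only the fields this class owns; later consumers may pop
        # the remaining entries.
        return cls(
            input_tokens=tensor_dict.pop("input_tokens", None),
            input_positions=tensor_dict.pop("input_positions", None),
        )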

as_broadcastable_tensor_dict abstractmethod

as_broadcastable_tensor_dict() -> Dict[str, Any]

Extract broadcastable fields. Override for fields that require some custom deserialization.

Source code in vllm/worker/model_runner_base.py
@abstractmethod
def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
    """
    Extract broadcastable fields. Override for fields that require some
    custom deserialization.
    """
    raise NotImplementedError

from_broadcasted_tensor_dict abstractmethod classmethod

from_broadcasted_tensor_dict(
    tensor_dict: Dict[str, Any],
    attn_backend: Optional[AttentionBackend] = None,
) -> T

Pop fields from the given tensor_dict and populate a new instance of BroadcastableModelInput.

Source code in vllm/worker/model_runner_base.py
@classmethod
@abstractmethod
def from_broadcasted_tensor_dict(
    cls: Type[T],
    tensor_dict: Dict[str, Any],
    attn_backend: Optional["AttentionBackend"] = None,
) -> T:
    """
    Pop fields from the given tensor_dict and populate a new instance of
    BroadcastableModelInput.
    """
    raise NotImplementedError

InputProcessingError

Bases: Exception

This exception is raised when an error occurs preparing the inputs for a single sequence group. This allows the engine to gracefully handle errors with a single sequence group without having to fail the entire batch.

Source code in vllm/worker/model_runner_base.py
class InputProcessingError(Exception):
    """This exception is raised when an error occurs preparing the inputs for
    a single sequence group.
    This allows the engine to gracefully handle errors with a single sequence
    group without having to fail the entire batch.
    """

    def __init__(self, request_id, message):
        """request_id is the id of the offending sequence group"""
        self.request_id = request_id
        self.message = message
        super().__init__(self.message)

    def __str__(self):
        return "Failed to prepare inputs for sequence group with request id: " \
                f"{self.request_id}, Error: {self.message}"

message instance-attribute

message = message

request_id instance-attribute

request_id = request_id

__init__

__init__(request_id, message)

request_id is the id of the offending sequence group

Source code in vllm/worker/model_runner_base.py
def __init__(self, request_id, message):
    """request_id is the id of the offending sequence group"""
    self.request_id = request_id
    self.message = message
    super().__init__(self.message)

__str__

__str__()
Source code in vllm/worker/model_runner_base.py
def __str__(self):
    return "Failed to prepare inputs for sequence group with request id: " \
            f"{self.request_id}, Error: {self.message}"

ModelRunnerBase

Bases: ABC, Generic[T]

Model runner interface that abstracts a particular hardware and/or type of model. Model execution may communicate data with model runners in other processes, but it should not include control plane metadata communication.

Each ModelRunnerBase subclass should define a corresponding ModelRunnerInputBase subclass.

Source code in vllm/worker/model_runner_base.py
class ModelRunnerBase(ABC, Generic[T]):
    """
    Model runner interface that abstracts a particular hardware and/or type of
    model. Model execution may communicate data with model runners in other
    processes, but it should not include control plane metadata communication.

    Each ModelRunnerBase subclass should define a corresponding
    ModelRunnerInputBase subclass.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
    ) -> None:
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config
        self.lora_config = vllm_config.lora_config
        self.load_config = vllm_config.load_config
        self.parallel_config = vllm_config.parallel_config
        self.scheduler_config = vllm_config.scheduler_config
        self.device_config = vllm_config.device_config
        self.speculative_config = vllm_config.speculative_config
        self.prompt_adapter_config = vllm_config.prompt_adapter_config
        self.observability_config = vllm_config.observability_config

    # Map of request_id -> generator used for seeded random sampling
    generators: Dict[str, torch.Generator] = {}

    @abstractmethod
    def make_model_input_from_broadcasted_tensor_dict(
        self,
        tensor_dict: Dict[str, Any],
    ) -> T:
        """
        Make an instance of a ModelRunnerInputBase from the broadcasted tensor
        dict.
        """
        raise NotImplementedError

    @abstractmethod
    def prepare_model_input(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        virtual_engine: int = 0,
        finished_requests_ids: Optional[List[str]] = None,
    ) -> T:
        """
        Prepare the inputs to ModelRunnerBase.execute_model from an execution
        request. This method may move data to the worker's local device. It is
        not allowed to communicate with other workers or devices.
        """
        raise NotImplementedError

    @abstractmethod
    def get_model(self) -> nn.Module:
        raise NotImplementedError

    def execute_model(
        self,
        model_input: T,
        kv_caches: Optional[List[torch.Tensor]],
        intermediate_tensors: Optional[IntermediateTensors] = None,
        num_steps: int = 1,
        **kwargs,
    ) -> Optional[List[SamplerOutput]]:
        """
        Execute the model on the given input.
        """
        raise NotImplementedError

    def get_generators(self, finished_request_ids: Optional[List[str]] = None):
        """
        Return dict of per-request generators used for random sampling.
        """

        # Clean up generators from completed requests
        if finished_request_ids:
            for request_id in finished_request_ids:
                self.generators.pop(request_id, None)

        return self.generators
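
A skeleton subclass showing the members a concrete backend must provide. ToyModelRunner is hypothetical, it reuses the ToyModelInput sketch from above, and the raising bodies are placeholders rather than a working runner.

from typing import Any, Dict, List, Optional

from torch import nn

from vllm.worker.model_runner_base import ModelRunnerBase


class ToyModelRunner(ModelRunnerBase["ToyModelInput"]):

    def make_model_input_from_broadcasted_tensor_dict(
            self, tensor_dict: Dict[str, Any]) -> "ToyModelInput":
        # Rebuild the worker-local input on a non-driver worker.
        return ToyModelInput.from_broadcasted_tensor_dict(tensor_dict)

    def prepare_model_input(
        self,
        seq_group_metadata_list,
        virtual_engine: int = 0,
        finished_requests_ids: Optional[List[str]] = None,
    ) -> "ToyModelInput":
        # Move request data to the local device here; communicating with
        # other workers or devices is not allowed in this method.
        raise NotImplementedError

    def get_model(self) -> nn.Module:
        raise NotImplementedError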

cache_config instance-attribute

cache_config = cache_config

device_config instance-attribute

device_config = device_config

generators class-attribute instance-attribute

generators: Dict[str, Generator] = {}

load_config instance-attribute

load_config = load_config

lora_config instance-attribute

lora_config = lora_config

model_config instance-attribute

model_config = model_config

observability_config instance-attribute

observability_config = observability_config

parallel_config instance-attribute

parallel_config = parallel_config

prompt_adapter_config instance-attribute

prompt_adapter_config = prompt_adapter_config

scheduler_config instance-attribute

scheduler_config = scheduler_config

speculative_config instance-attribute

speculative_config = speculative_config

vllm_config instance-attribute

vllm_config = vllm_config

__init__

__init__(vllm_config: VllmConfig) -> None
Source code in vllm/worker/model_runner_base.py
def __init__(
    self,
    vllm_config: VllmConfig,
) -> None:
    self.vllm_config = vllm_config
    self.model_config = vllm_config.model_config
    self.cache_config = vllm_config.cache_config
    self.lora_config = vllm_config.lora_config
    self.load_config = vllm_config.load_config
    self.parallel_config = vllm_config.parallel_config
    self.scheduler_config = vllm_config.scheduler_config
    self.device_config = vllm_config.device_config
    self.speculative_config = vllm_config.speculative_config
    self.prompt_adapter_config = vllm_config.prompt_adapter_config
    self.observability_config = vllm_config.observability_config

execute_model

execute_model(
    model_input: T,
    kv_caches: Optional[List[Tensor]],
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    num_steps: int = 1,
    **kwargs,
) -> Optional[List[SamplerOutput]]

Execute the model on the given input.

Source code in vllm/worker/model_runner_base.py
def execute_model(
    self,
    model_input: T,
    kv_caches: Optional[List[torch.Tensor]],
    intermediate_tensors: Optional[IntermediateTensors] = None,
    num_steps: int = 1,
    **kwargs,
) -> Optional[List[SamplerOutput]]:
    """
    Execute the model on the given input.
    """
    raise NotImplementedError

get_generators

get_generators(
    finished_request_ids: Optional[List[str]] = None,
)

Return dict of per-request generators used for random sampling.

Source code in vllm/worker/model_runner_base.py
def get_generators(self, finished_request_ids: Optional[List[str]] = None):
    """
    Return dict of per-request generators used for random sampling.
    """

    # Clean up generators from completed requests
    if finished_request_ids:
        for request_id in finished_request_ids:
            self.generators.pop(request_id, None)

    return self.generators
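
A sketch of how per-request seeded sampling uses this map. _StubRunner exists only to obtain a concrete instance; it deliberately skips ModelRunnerBase.__init__ because the demo touches nothing but generators.

import torch

from vllm.worker.model_runner_base import ModelRunnerBase


class _StubRunner(ModelRunnerBase):

    def __init__(self):
        # Skip the config-loading base __init__ for this demo.
        self.generators = {}

    def make_model_input_from_broadcasted_tensor_dict(self, tensor_dict):
        raise NotImplementedError

    def prepare_model_input(self, seq_group_metadata_list,
                            virtual_engine=0, finished_requests_ids=None):
        raise NotImplementedError

    def get_model(self):
        raise NotImplementedError


runner = _StubRunner()
# Register a seeded generator for a request that samples with a fixed seed.
runner.generators["req-42"] = torch.Generator().manual_seed(42)

# Passing the finished ids prunes generators of completed requests.
live = runner.get_generators(finished_request_ids=["req-42"])
assert "req-42" not in live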

get_model abstractmethod

get_model() -> Module
Source code in vllm/worker/model_runner_base.py
@abstractmethod
def get_model(self) -> nn.Module:
    raise NotImplementedError

make_model_input_from_broadcasted_tensor_dict abstractmethod

make_model_input_from_broadcasted_tensor_dict(
    tensor_dict: Dict[str, Any],
) -> T

Make an instance of a ModelRunnerInputBase from the broadcasted tensor dict.

Source code in vllm/worker/model_runner_base.py
@abstractmethod
def make_model_input_from_broadcasted_tensor_dict(
    self,
    tensor_dict: Dict[str, Any],
) -> T:
    """
    Make an instance of a ModelRunnerInputBase from the broadcasted tensor
    dict.
    """
    raise NotImplementedError

prepare_model_input abstractmethod

prepare_model_input(
    seq_group_metadata_list: List[SequenceGroupMetadata],
    virtual_engine: int = 0,
    finished_requests_ids: Optional[List[str]] = None,
) -> T

Prepare the inputs to ModelRunnerBase.execute_model from an execution request. This method may move data to the worker's local device. It is not allowed to communicate with other workers or devices.

Source code in vllm/worker/model_runner_base.py
@abstractmethod
def prepare_model_input(
    self,
    seq_group_metadata_list: List[SequenceGroupMetadata],
    virtual_engine: int = 0,
    finished_requests_ids: Optional[List[str]] = None,
) -> T:
    """
    Prepare the inputs to ModelRunnerBase.execute_model from an execution
    request. This method may move data to the worker's local device. It is
    not allowed to communicate with other workers or devices.
    """
    raise NotImplementedError

ModelRunnerInputBase dataclass

Bases: BroadcastableModelInput

Local inputs to each worker's model runner. May contain device-specific data. Different worker backends may have different methods of converting from the global ExecuteModelRequest produced by the LLM engine to the worker-local ModelRunnerInputBase objects.

Model runners that support multi-GPU execution should define a ModelRunnerInputBase subclass, add their required fields, and specify how to serialize/deserialize a ModelInput for broadcast between workers.

Source code in vllm/worker/model_runner_base.py
@dataclasses.dataclass(frozen=True)
class ModelRunnerInputBase(BroadcastableModelInput):
    """Local inputs to each worker's model runner. May contain
    device-specific data. Different worker backends may have different methods
    of converting from the global ExecuteModelRequest produced by the LLM
    engine to the worker-local ModelRunnerInputBase objects.

    Model runners that support multi-GPU execution should define a
    ModelRunnerInputBase subclass, add their required fields, and specify how to
    serialize/deserialize a ModelInput for broadcast between workers.
    """
    pass
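
A hedged sketch of a concrete frozen input type; ToyRunnerInput and its fields are illustrative, not a real vllm class. Because the base class is a frozen dataclass, updated copies come from dataclasses.replace rather than mutation.

import dataclasses
from typing import Any, Dict, Optional

import torch

from vllm.worker.model_runner_base import ModelRunnerInputBase


@dataclasses.dataclass(frozen=True)
class ToyRunnerInput(ModelRunnerInputBase):
    # Hypothetical fields; real subclasses add whatever their backend needs.
    input_tokens: Optional[torch.Tensor] = None
    virtual_engine: int = 0

    def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
        return {
            "input_tokens": self.input_tokens,
            "virtual_engine": self.virtual_engine,
        }

    @classmethod
    def from_broadcasted_tensor_dict(cls, tensor_dict, attn_backend=None):
        return cls(**tensor_dict)


inp = ToyRunnerInput(input_tokens=torch.tensor([1, 2, 3]))
inp_ve1 = dataclasses.replace(inp, virtual_engine=1)  # immutable update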

__init__

__init__() -> None

ModelRunnerInputBuilderBase

Bases: ABC, Generic[T]

A builder to create ModelRunnerInputBase objects.

Source code in vllm/worker/model_runner_base.py
class ModelRunnerInputBuilderBase(ABC, Generic[T]):
    """A builder to create ModelRunnerInputBase objects.
  """

    @abstractmethod
    def prepare(self,
                finished_requests_ids: Optional[List[str]] = None) -> None:
        raise NotImplementedError

    @abstractmethod
    def add_seq_group(self, seq_group_metadata):
        """TBA"""
        raise NotImplementedError

    @abstractmethod
    def build(self, *args, **kwargs) -> T:
        """Build metadata with on-device tensors."""
        raise NotImplementedError
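
The abstract methods imply a prepare -> add_seq_group -> build life cycle. A hedged driver sketch, where builder is any concrete ModelRunnerInputBuilderBase implementation and seq_group_metadata_list comes from the scheduler:

from typing import List, Optional


def build_model_input(builder, seq_group_metadata_list,
                      finished_requests_ids: Optional[List[str]] = None):
    # Reset per-batch state, fold in each sequence group, then
    # materialize the input with on-device tensors.
    builder.prepare(finished_requests_ids=finished_requests_ids)
    for seq_group_metadata in seq_group_metadata_list:
        builder.add_seq_group(seq_group_metadata)
    return builder.build()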

add_seq_group abstractmethod

add_seq_group(seq_group_metadata)

Add a sequence group's metadata to the batch being built.

Source code in vllm/worker/model_runner_base.py
@abstractmethod
def add_seq_group(self, seq_group_metadata):
    """TBA"""
    raise NotImplementedError

build abstractmethod

build(*args, **kwargs) -> T

Build metadata with on-device tensors.

Source code in vllm/worker/model_runner_base.py
@abstractmethod
def build(self, *args, **kwargs) -> T:
    """Build metadata with on-device tensors."""
    raise NotImplementedError

prepare abstractmethod

prepare(
    finished_requests_ids: Optional[List[str]] = None,
) -> None
Source code in vllm/worker/model_runner_base.py
@abstractmethod
def prepare(self,
            finished_requests_ids: Optional[List[str]] = None) -> None:
    raise NotImplementedError

ModelRunnerWrapperBase

The whole point of this class is to lazily initialize the model_runner.

Source code in vllm/worker/model_runner_base.py
class ModelRunnerWrapperBase:
    """
    The whole point of this class is to lazily initialize the model_runner.
    """

    def __init__(
        self,
        model_runner: ModelRunnerBase,
    ) -> None:
        self.model_runner: ModelRunnerBase = model_runner

    def __getattr__(self, attr):
        return getattr(self.model_runner, attr)
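
A delegation sketch: anything the wrapper does not define itself falls through to the wrapped runner via __getattr__. _TimingWrapper is hypothetical; it overrides one method and inherits the rest by delegation.

import time

from vllm.worker.model_runner_base import ModelRunnerWrapperBase


class _TimingWrapper(ModelRunnerWrapperBase):

    def execute_model(self, *args, **kwargs):
        start = time.perf_counter()
        try:
            # self.model_runner is a real attribute, so this call does
            # not recurse through __getattr__.
            return self.model_runner.execute_model(*args, **kwargs)
        finally:
            print(f"execute_model took {time.perf_counter() - start:.3f}s")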

model_runner instance-attribute

model_runner: ModelRunnerBase = model_runner

__getattr__

__getattr__(attr)
Source code in vllm/worker/model_runner_base.py
def __getattr__(self, attr):
    return getattr(self.model_runner, attr)

__init__

__init__(model_runner: ModelRunnerBase) -> None
Source code in vllm/worker/model_runner_base.py
def __init__(
    self,
    model_runner: ModelRunnerBase,
) -> None:
    self.model_runner: ModelRunnerBase = model_runner

_add_attn_metadata_broadcastable_dict

_add_attn_metadata_broadcastable_dict(
    tensor_dict: Dict[str, Any],
    attn_metadata: Optional[AttentionMetadata],
) -> None

Helper method to update tensor_dict with broadcastable AttentionMetadata fields.

Source code in vllm/worker/model_runner_base.py
def _add_attn_metadata_broadcastable_dict(
        tensor_dict: Dict[str, Any],
        attn_metadata: Optional["AttentionMetadata"]) -> None:
    """
    Helper method to update tensor_dict with broadcastable
    AttentionMetadata fields.
    """
    if attn_metadata is not None:
        tensor_dict.update(attn_metadata.asdict_zerocopy())

_add_sampling_metadata_broadcastable_dict

_add_sampling_metadata_broadcastable_dict(
    tensor_dict: Dict[str, Any],
    sampling_metadata: Optional[SamplingMetadata],
) -> None

Helper method to update tensor_dict with broadcastable SamplingMetadata fields.

Source code in vllm/worker/model_runner_base.py
def _add_sampling_metadata_broadcastable_dict(
        tensor_dict: Dict[str, Any],
        sampling_metadata: Optional["SamplingMetadata"]) -> None:
    """
    Helper method to update tensor_dict with broadcastable
    SamplingMetadata fields.
    """
    if sampling_metadata is not None:
        tensor_dict["selected_token_indices"] = (
            sampling_metadata.selected_token_indices)

_init_attn_metadata_from_tensor_dict

_init_attn_metadata_from_tensor_dict(
    attn_backend: AttentionBackend,
    tensor_dict: Dict[str, Any],
) -> Dict[str, Any]

Helper method to initialize AttentionMetadata based on an AttentionBackend and broadcastable AttentionMetadata fields.

Source code in vllm/worker/model_runner_base.py
def _init_attn_metadata_from_tensor_dict(
    attn_backend: "AttentionBackend",
    tensor_dict: Dict[str, Any],
) -> Dict[str, Any]:
    """
    Helper method to initialize AttentionMetadata based on an
    AttentionBackend and broadcastable AttentionMetadata fields.
    """
    # Extract the fields used to create AttentionMetadata.
    valid_attn_kwargs = {}
    for field in dataclasses.fields(attn_backend.get_metadata_cls()):
        if field.name in tensor_dict:
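            # Note: input_positions is read without popping below, so it
            # also stays in tensor_dict for later consumers.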
            if field.name == "input_positions":
                valid_attn_kwargs[field.name] = tensor_dict[field.name]
            else:
                valid_attn_kwargs[field.name] = tensor_dict.pop(field.name)

    attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs)
    tensor_dict["attn_metadata"] = attn_metadata
    return tensor_dict
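
A hedged round-trip sketch pairing this helper with _add_attn_metadata_broadcastable_dict above. attn_backend stands for any concrete AttentionBackend and attn_metadata for an instance of its metadata dataclass; neither is constructed here.

from typing import Any, Dict

from vllm.worker.model_runner_base import (
    _add_attn_metadata_broadcastable_dict,
    _init_attn_metadata_from_tensor_dict)


def roundtrip_attn_metadata(attn_backend, attn_metadata):
    # Driver side: flatten the metadata into broadcastable entries.
    tensor_dict: Dict[str, Any] = {}
    _add_attn_metadata_broadcastable_dict(tensor_dict, attn_metadata)
    # ...the dict would be broadcast to the other workers here...
    # Worker side: pop the fields back out and rebuild the metadata.
    tensor_dict = _init_attn_metadata_from_tensor_dict(
        attn_backend, tensor_dict)
    return tensor_dict["attn_metadata"]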

_init_frozen_model_input_from_tensor_dict

_init_frozen_model_input_from_tensor_dict(
    frozen_model_input_cls: Type[ModelRunnerInputBase],
    tensor_dict: Dict[str, Any],
) -> Dict[str, Any]

Helper method to initialize a frozen ModelInput based on broadcastable ModelInput fields.

Source code in vllm/worker/model_runner_base.py
def _init_frozen_model_input_from_tensor_dict(
        frozen_model_input_cls: Type["ModelRunnerInputBase"],
        tensor_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Helper method to initialize a frozen ModelInput based on broadcastable
    ModelInput fields.
    """
    valid_tensor_kwargs = {}
    for field in dataclasses.fields(frozen_model_input_cls):
        val = tensor_dict.pop(field.name, None)
        if val is not None:
            valid_tensor_kwargs[field.name] = val

    frozen_model_input = frozen_model_input_cls(**valid_tensor_kwargs)
    tensor_dict["frozen_model_input"] = frozen_model_input
    return tensor_dict
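
A sketch reusing the hypothetical ToyRunnerInput from the ModelRunnerInputBase section: matching fields are popped into the frozen instance, while unrelated keys stay in the dict.

import torch

from vllm.worker.model_runner_base import (
    _init_frozen_model_input_from_tensor_dict)

tensor_dict = {
    "input_tokens": torch.tensor([1, 2]),  # matches a ToyRunnerInput field
    "unrelated": 0,                        # not a ToyRunnerInput field
}
tensor_dict = _init_frozen_model_input_from_tensor_dict(
    ToyRunnerInput, tensor_dict)
frozen = tensor_dict["frozen_model_input"]  # a ToyRunnerInput instance
assert "unrelated" in tensor_dict  # non-field keys are left in place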

_init_sampling_metadata_from_tensor_dict

_init_sampling_metadata_from_tensor_dict(
    tensor_dict: Dict[str, Any],
) -> Dict[str, Any]

Helper method to initialize SamplingMetadata based on broadcastable SamplingMetadata fields.

Source code in vllm/worker/model_runner_base.py
def _init_sampling_metadata_from_tensor_dict(  # type: ignore
        tensor_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Helper method to initialize SamplingMetadata based on broadcastable
    SamplingMetadata fields.
    """
    from vllm.model_executor import SamplingMetadata

    selected_token_indices = tensor_dict.pop("selected_token_indices", None)
    # An empty SamplingMetadata to signal that the worker should skip
    # sampling.
    if selected_token_indices is not None:
        tensor_dict["sampling_metadata"] = SamplingMetadata(
            seq_groups=None,
            selected_token_indices=selected_token_indices,
            categorized_sample_indices=None,
            num_prompts=0,
        )
    return tensor_dict
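
A hedged round trip with _add_sampling_metadata_broadcastable_dict from above: only selected_token_indices survives the broadcast, and the rebuilt SamplingMetadata is otherwise empty, exactly as constructed in the source.

from typing import Any, Dict

import torch

from vllm.model_executor import SamplingMetadata
from vllm.worker.model_runner_base import (
    _add_sampling_metadata_broadcastable_dict,
    _init_sampling_metadata_from_tensor_dict)

sampling_metadata = SamplingMetadata(
    seq_groups=None,
    selected_token_indices=torch.tensor([0, 3, 7]),
    categorized_sample_indices=None,
    num_prompts=0,
)
tensor_dict: Dict[str, Any] = {}
_add_sampling_metadata_broadcastable_dict(tensor_dict, sampling_metadata)
tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict)
restored = tensor_dict["sampling_metadata"]
assert torch.equal(restored.selected_token_indices,
                   torch.tensor([0, 3, 7]))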