vllm.worker.cpu_pooling_model_runner

CPUPoolingModelRunner

Bases: CPUModelRunnerBase[ModelInputForCPUWithPoolingMetadata]

Source code in vllm/worker/cpu_pooling_model_runner.py
class CPUPoolingModelRunner(
        CPUModelRunnerBase[ModelInputForCPUWithPoolingMetadata]):
    _model_input_cls: Type[ModelInputForCPUWithPoolingMetadata] = (
        ModelInputForCPUWithPoolingMetadata)
    _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder

    @torch.inference_mode()
    def execute_model(
        self,
        model_input: ModelInputForCPUWithPoolingMetadata,
        kv_caches: List[torch.Tensor],
        intermediate_tensors: Optional[IntermediateTensors] = None,
        num_steps: int = 1,
    ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]:
        if num_steps > 1:
            raise ValueError(
                "CPU worker does not support multi-step execution.")

        model_executable = self.model
        cross_enc_kwargs = {}
        if model_input.token_type_ids is not None:
            cross_enc_kwargs["token_type_ids"] = model_input.token_type_ids
        execute_model_kwargs = {
            "input_ids": model_input.input_tokens,
            "positions": model_input.input_positions,
            **MultiModalKwargs.as_kwargs(
                model_input.multi_modal_kwargs or {},
                device=self.device,
            ),
            **cross_enc_kwargs,
            "intermediate_tensors": intermediate_tensors,
        }

        with set_forward_context(model_input.attn_metadata, self.vllm_config,
                                 model_input.virtual_engine):
            hidden_states = model_executable(**execute_model_kwargs)

        # Only perform pooling in the driver worker.
        if not self.is_driver_worker:
            return []

        return [
            self.model.pooler(hidden_states=hidden_states,
                              pooling_metadata=model_input.pooling_metadata)
        ]

    def make_model_input_from_broadcasted_tensor_dict(
            self, tensor_dict: Dict[str, Any]
    ) -> ModelInputForCPUWithPoolingMetadata:
        return ModelInputForCPUWithPoolingMetadata.from_broadcasted_tensor_dict(
            tensor_dict,
            attn_backend=self.attn_backend,
        )

    def prepare_model_input(
        self,
        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
        virtual_engine: int = 0,
        finished_requests_ids: Optional[List[str]] = None
    ) -> ModelInputForCPUWithPoolingMetadata:
        assert seq_group_metadata_list is not None
        model_input = self._prepare_model_input_tensors(
            seq_group_metadata_list, finished_requests_ids)
        # Prepare PoolingMetadata.
        assert model_input.seq_lens is not None
        pooling_metadata = self._prepare_pooling(seq_group_metadata_list,
                                                 model_input.seq_lens)

        return dataclasses.replace(model_input,
                                   virtual_engine=virtual_engine,
                                   pooling_metadata=pooling_metadata)

    def _prepare_pooling(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        prompt_lens: List[int],
    ) -> PoolingMetadata:
        """Prepare PoolingMetadata for the sequence group metadata list."""
        seq_groups: List[Tuple[List[int], PoolingParams]] = []
        for seq_group_metadata in seq_group_metadata_list:
            seq_ids = list(seq_group_metadata.seq_data.keys())
            pooling_params = seq_group_metadata.pooling_params
            seq_groups.append((seq_ids, pooling_params))

        seq_data: Dict[int, SequenceData] = {}
        for seq_group_metadata in seq_group_metadata_list:
            seq_data.update(seq_group_metadata.seq_data)

        pooling_metadata = PoolingMetadata(
            seq_groups=seq_groups,
            seq_data=seq_data,
            prompt_lens=prompt_lens,
        )

        return pooling_metadata
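
For orientation, here is a minimal sketch of how a CPU worker might drive this runner. The names runner and seq_group_metadata_list are assumed to come from the surrounding worker and scheduler, so this is an illustration rather than the worker's actual loop.

# Hypothetical driver-side flow; runner is an initialized
# CPUPoolingModelRunner and seq_group_metadata_list comes from the
# scheduler (both assumed for this sketch).
model_input = runner.prepare_model_input(
    seq_group_metadata_list,
    virtual_engine=0,
)
# num_steps must stay at 1: multi-step execution raises ValueError.
outputs = runner.execute_model(
    model_input=model_input,
    kv_caches=[],  # not referenced by the pooling path shown above
)
# Driver worker: a single-element List[PoolerOutput].
# Non-driver workers: an empty list.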

_builder_cls class-attribute instance-attribute

_builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder

_model_input_cls class-attribute instance-attribute

_model_input_cls: Type[ModelInputForCPUWithPoolingMetadata] = (
    ModelInputForCPUWithPoolingMetadata)

_prepare_pooling

_prepare_pooling(
    seq_group_metadata_list: List[SequenceGroupMetadata],
    prompt_lens: List[int],
) -> PoolingMetadata

Prepare PoolingMetadata for the sequence group metadata list.

Source code in vllm/worker/cpu_pooling_model_runner.py
def _prepare_pooling(
    self,
    seq_group_metadata_list: List[SequenceGroupMetadata],
    prompt_lens: List[int],
) -> PoolingMetadata:
    """Prepare PoolingMetadata for the sequence group metadata list."""
    seq_groups: List[Tuple[List[int], PoolingParams]] = []
    for seq_group_metadata in seq_group_metadata_list:
        seq_ids = list(seq_group_metadata.seq_data.keys())
        pooling_params = seq_group_metadata.pooling_params
        seq_groups.append((seq_ids, pooling_params))

    seq_data: Dict[int, SequenceData] = {}
    for seq_group_metadata in seq_group_metadata_list:
        seq_data.update(seq_group_metadata.seq_data)

    pooling_metadata = PoolingMetadata(
        seq_groups=seq_groups,
        seq_data=seq_data,
        prompt_lens=prompt_lens,
    )

    return pooling_metadata
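
To make the assembled structure concrete, a hedged example for a batch of two single-sequence prompts. The sequence ids and prompt lengths are invented, and the import paths follow vLLM's usual layout but should be treated as assumptions of this sketch.

from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.pooling_params import PoolingParams

# Two sequence groups, one sequence each; ids and lengths are made up.
seq_groups = [([0], PoolingParams()), ([1], PoolingParams())]
seq_data = {0: ..., 1: ...}  # seq_id -> SequenceData (elided here)
metadata = PoolingMetadata(
    seq_groups=seq_groups,
    seq_data=seq_data,
    prompt_lens=[7, 12],
)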

execute_model

execute_model(
    model_input: ModelInputForCPUWithPoolingMetadata,
    kv_caches: List[Tensor],
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    num_steps: int = 1,
) -> Optional[
    Union[List[PoolerOutput], IntermediateTensors]
]
Source code in vllm/worker/cpu_pooling_model_runner.py
@torch.inference_mode()
def execute_model(
    self,
    model_input: ModelInputForCPUWithPoolingMetadata,
    kv_caches: List[torch.Tensor],
    intermediate_tensors: Optional[IntermediateTensors] = None,
    num_steps: int = 1,
) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]:
    if num_steps > 1:
        raise ValueError(
            "CPU worker does not support multi-step execution.")

    model_executable = self.model
    cross_enc_kwargs = {}
    if model_input.token_type_ids is not None:
        cross_enc_kwargs["token_type_ids"] = model_input.token_type_ids
    execute_model_kwargs = {
        "input_ids": model_input.input_tokens,
        "positions": model_input.input_positions,
        **MultiModalKwargs.as_kwargs(
            model_input.multi_modal_kwargs or {},
            device=self.device,
        ),
        **cross_enc_kwargs,
        "intermediate_tensors": intermediate_tensors,
    }

    with set_forward_context(model_input.attn_metadata, self.vllm_config,
                             model_input.virtual_engine):
        hidden_states = model_executable(**execute_model_kwargs)

    # Only perform pooling in the driver worker.
    if not self.is_driver_worker:
        return []

    return [
        self.model.pooler(hidden_states=hidden_states,
                          pooling_metadata=model_input.pooling_metadata)
    ]
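
The early return encodes the driver/non-driver contract: only the driver worker runs the pooler, and the other ranks return an empty list so that call sites stay type-compatible with the driver's List[PoolerOutput]. A short sketch, assuming runner and model_input from earlier setup:

outputs = runner.execute_model(model_input, kv_caches=[])
if runner.is_driver_worker:
    assert len(outputs) == 1  # one PoolerOutput covering the whole batch
else:
    assert outputs == []  # pooling is skipped off the driver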

make_model_input_from_broadcasted_tensor_dict

make_model_input_from_broadcasted_tensor_dict(
    tensor_dict: Dict[str, Any],
) -> ModelInputForCPUWithPoolingMetadata
Source code in vllm/worker/cpu_pooling_model_runner.py
def make_model_input_from_broadcasted_tensor_dict(
        self, tensor_dict: Dict[str, Any]
) -> ModelInputForCPUWithPoolingMetadata:
    return ModelInputForCPUWithPoolingMetadata.from_broadcasted_tensor_dict(
        tensor_dict,
        attn_backend=self.attn_backend,
    )
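
This method is the receiving half of the tensor-parallel broadcast: the driver flattens its prepared input, and the other workers rebuild an equivalent model input here. A hedged round-trip sketch, assuming the input class inherits as_broadcastable_tensor_dict() as vLLM's other model-input dataclasses do:

# Driver side: flatten the prepared input for broadcast (assumed API).
tensor_dict = model_input.as_broadcastable_tensor_dict()
# ... broadcast tensor_dict to the other ranks ...
# Receiving side: rebuild an equivalent model input.
rebuilt = runner.make_model_input_from_broadcasted_tensor_dict(tensor_dict)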

prepare_model_input

prepare_model_input(
    seq_group_metadata_list: Optional[
        List[SequenceGroupMetadata]
    ],
    virtual_engine: int = 0,
    finished_requests_ids: Optional[List[str]] = None,
) -> ModelInputForCPUWithPoolingMetadata
Source code in vllm/worker/cpu_pooling_model_runner.py
def prepare_model_input(
    self,
    seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
    virtual_engine: int = 0,
    finished_requests_ids: Optional[List[str]] = None
) -> ModelInputForCPUWithPoolingMetadata:
    assert seq_group_metadata_list is not None
    model_input = self._prepare_model_input_tensors(
        seq_group_metadata_list, finished_requests_ids)
    # Prepare PoolingMetadata.
    assert model_input.seq_lens is not None
    pooling_metadata = self._prepare_pooling(seq_group_metadata_list,
                                             model_input.seq_lens)

    return dataclasses.replace(model_input,
                               virtual_engine=virtual_engine,
                               pooling_metadata=pooling_metadata)
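
Because the model input is a frozen dataclass, dataclasses.replace is used instead of mutation: it returns a new instance with the extra fields filled in. A minimal sketch (every field defaults to None, so an empty instance is valid):

import dataclasses

base = ModelInputForCPUWithPoolingMetadata()
updated = dataclasses.replace(base, virtual_engine=0)
assert base.virtual_engine is None and updated is not base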

ModelInputForCPUWithPoolingMetadata dataclass

Bases: ModelInputForCPU

Used by the CPUPoolingModelRunner.

Source code in vllm/worker/cpu_pooling_model_runner.py
@dataclasses.dataclass(frozen=True)
class ModelInputForCPUWithPoolingMetadata(ModelInputForCPU):
    """
    Used by the CPUPoolingModelRunner.
    """
    pooling_metadata: Optional["PoolingMetadata"] = None

pooling_metadata class-attribute instance-attribute

pooling_metadata: Optional[PoolingMetadata] = None

__init__

__init__(
    input_tokens: Optional[Tensor] = None,
    input_positions: Optional[Tensor] = None,
    token_type_ids: Optional[Tensor] = None,
    attn_metadata: Optional[AttentionMetadata] = None,
    multi_modal_kwargs: Optional[
        BatchedTensorInputs
    ] = None,
    virtual_engine: Optional[int] = None,
    seq_lens: Optional[List[int]] = None,
    query_lens: Optional[List[int]] = None,
    lora_mapping: Optional[LoRAMapping] = None,
    lora_requests: Optional[Set[LoRARequest]] = None,
    pooling_metadata: Optional[PoolingMetadata] = None,
) -> None
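
Since the dataclass is frozen, fields cannot be assigned after construction; they are set at construction time or via dataclasses.replace, as in prepare_model_input above. A hedged illustration:

import dataclasses

mi = ModelInputForCPUWithPoolingMetadata()
try:
    mi.pooling_metadata = None  # type: ignore[misc]
except dataclasses.FrozenInstanceError:
    pass  # frozen: construct with the field or use dataclasses.replace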