vllm.attention.backends.placeholder_attn

PlaceholderAttentionBackend

Bases: AttentionBackend

Placeholder backend for when no attention is needed.

Source code in vllm/attention/backends/placeholder_attn.py
class PlaceholderAttentionBackend(AttentionBackend):
    """Placeholder backend for when no attention is needed."""

    @staticmethod
    def get_name() -> str:
        return "NO_ATTENTION"

    @staticmethod
    def get_impl_cls() -> Type["PlaceholderAttentionImpl"]:
        return PlaceholderAttentionImpl

    @staticmethod
    def get_builder_cls() -> Type["PlaceholderAttentionMetadataBuilder"]:
        return PlaceholderAttentionMetadataBuilder

    @staticmethod
    def get_metadata_cls() -> Type["PlaceholderAttentionMetadata"]:
        return PlaceholderAttentionMetadata

    @staticmethod
    def get_state_cls() -> Type["CommonAttentionState"]:
        return CommonAttentionState

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
    ) -> Tuple[int, ...]:
        return (1, 1, 1, 1, 1)

    @staticmethod
    def swap_blocks(
        src_kv_cache: torch.Tensor,
        dst_kv_cache: torch.Tensor,
        src_to_dst: torch.Tensor,
    ) -> None:
        return

    @staticmethod
    def copy_blocks(
        kv_caches: List[torch.Tensor],
        src_to_dists: torch.Tensor,
    ) -> None:
        return
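
Because every method on this backend is a static constant or no-op, the class can be exercised directly without instantiating it. A minimal sketch (assuming vLLM and PyTorch are importable; the argument values are illustrative):

import torch

from vllm.attention.backends.placeholder_attn import PlaceholderAttentionBackend

print(PlaceholderAttentionBackend.get_name())  # "NO_ATTENTION"

# The arguments are ignored; the KV cache collapses to a dummy 5-element shape.
print(PlaceholderAttentionBackend.get_kv_cache_shape(
    num_blocks=128, block_size=16, num_kv_heads=8, head_size=64))  # (1, 1, 1, 1, 1)

# Block management is a no-op for attention-free models.
dummy = torch.empty(0)
PlaceholderAttentionBackend.swap_blocks(dummy, dummy, dummy)
PlaceholderAttentionBackend.copy_blocks([dummy], dummy)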

copy_blocks staticmethod

copy_blocks(
    kv_caches: List[Tensor], src_to_dists: Tensor
) -> None
Source code in vllm/attention/backends/placeholder_attn.py
@staticmethod
def copy_blocks(
    kv_caches: List[torch.Tensor],
    src_to_dists: torch.Tensor,
) -> None:
    return

get_builder_cls staticmethod

get_builder_cls() -> Type[
    PlaceholderAttentionMetadataBuilder
]
Source code in vllm/attention/backends/placeholder_attn.py
@staticmethod
def get_builder_cls() -> Type["PlaceholderAttentionMetadataBuilder"]:
    return PlaceholderAttentionMetadataBuilder

get_impl_cls staticmethod

get_impl_cls() -> Type[PlaceholderAttentionImpl]
Source code in vllm/attention/backends/placeholder_attn.py
@staticmethod
def get_impl_cls() -> Type["PlaceholderAttentionImpl"]:
    return PlaceholderAttentionImpl

get_kv_cache_shape staticmethod

get_kv_cache_shape(
    num_blocks: int,
    block_size: int,
    num_kv_heads: int,
    head_size: int,
) -> Tuple[int, ...]
Source code in vllm/attention/backends/placeholder_attn.py
@staticmethod
def get_kv_cache_shape(
    num_blocks: int,
    block_size: int,
    num_kv_heads: int,
    head_size: int,
) -> Tuple[int, ...]:
    return (1, 1, 1, 1, 1)

get_metadata_cls staticmethod

get_metadata_cls() -> Type[PlaceholderAttentionMetadata]
Source code in vllm/attention/backends/placeholder_attn.py
@staticmethod
def get_metadata_cls() -> Type["PlaceholderAttentionMetadata"]:
    return PlaceholderAttentionMetadata

get_name staticmethod

get_name() -> str
Source code in vllm/attention/backends/placeholder_attn.py
@staticmethod
def get_name() -> str:
    return "NO_ATTENTION"

get_state_cls staticmethod

get_state_cls() -> Type[CommonAttentionState]
Source code in vllm/attention/backends/placeholder_attn.py
@staticmethod
def get_state_cls() -> Type["CommonAttentionState"]:
    return CommonAttentionState

swap_blocks staticmethod

swap_blocks(
    src_kv_cache: Tensor,
    dst_kv_cache: Tensor,
    src_to_dst: Tensor,
) -> None
Source code in vllm/attention/backends/placeholder_attn.py
@staticmethod
def swap_blocks(
    src_kv_cache: torch.Tensor,
    dst_kv_cache: torch.Tensor,
    src_to_dst: torch.Tensor,
) -> None:
    return

PlaceholderAttentionImpl

Bases: AttentionImpl

Source code in vllm/attention/backends/placeholder_attn.py
class PlaceholderAttentionImpl(AttentionImpl):

    def __init__(self, *args, **kwargs) -> None:
        return

    def forward(self, *args, **kwargs) -> torch.Tensor:
        raise NotImplementedError

__init__

__init__(*args, **kwargs) -> None
Source code in vllm/attention/backends/placeholder_attn.py
def __init__(self, *args, **kwargs) -> None:
    return

forward

forward(*args, **kwargs) -> Tensor
Source code in vllm/attention/backends/placeholder_attn.py
def forward(self, *args, **kwargs) -> torch.Tensor:
    raise NotImplementedError

PlaceholderAttentionMetadata dataclass

Bases: AttentionMetadata

Attention metadata for prefill and decode batched together.

Source code in vllm/attention/backends/placeholder_attn.py
@dataclass
class PlaceholderAttentionMetadata(AttentionMetadata):
    """Attention metadata for prefill and decode batched together."""
    # (batch_size,). The sequence length per sequence. Sequence length means
    # the computed tokens + new tokens. None if it is a decoding.
    seq_lens: Optional[List[int]]
    # seq_lens stored as a tensor.
    seq_lens_tensor: Optional[torch.Tensor]

    # Maximum sequence length among prefill batch. 0 if there are decoding
    # requests only.
    max_prefill_seq_len: int
    # Maximum sequence length among decode batch. 0 if there are prefill
    # requests only.
    max_decode_seq_len: int
    # (batch_size,) A tensor of context lengths (tokens that are computed
    # so far).
    context_lens_tensor: Optional[torch.Tensor]

    # Whether or not cuda graph is enabled.
    # Cuda-graph is currently enabled for decoding only.
    # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
    use_cuda_graph: bool

    # Maximum query length in the batch.
    max_query_len: Optional[int]

    # Max number of query tokens among requests in the batch.
    max_decode_query_len: Optional[int]

    # (batch_size + 1,). The cumulative subquery lengths of the sequences in
    # the batch, used to index into subquery. E.g., if the subquery length
    # is [4, 6], it is [0, 4, 10].
    query_start_loc: Optional[torch.Tensor] = None
    # (batch_size + 1,). The cumulative sequence lengths of the sequences in
    # the batch, used to index into sequence. E.g., if the sequence length is
    # [4, 6], it is [0, 4, 10].
    seq_start_loc: Optional[torch.Tensor] = None

    # Placeholder.
    block_tables: Optional[torch.Tensor] = None

    _cached_prefill_metadata: Optional["PlaceholderAttentionMetadata"] = None
    _cached_decode_metadata: Optional["PlaceholderAttentionMetadata"] = None

    @property
    def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
        if self.num_prefills == 0:
            return None

        if self._cached_prefill_metadata is not None:
            return self._cached_prefill_metadata

        # Compute some attn_metadata fields which default to None
        query_start_loc = (None if self.query_start_loc is None else
                           self.query_start_loc[:self.num_prefills + 1])
        seq_lens = (None if self.seq_lens is None else
                    self.seq_lens[:self.num_prefills])
        seq_lens_tensor = (None if self.seq_lens_tensor is None else
                           self.seq_lens_tensor[:self.num_prefills])
        seq_start_loc = (None if self.seq_start_loc is None else
                         self.seq_start_loc[:self.num_prefills + 1])
        context_lens_tensor = (None if self.context_lens_tensor is None else
                               self.context_lens_tensor[:self.num_prefills])

        # Placeholders
        slot_mapping = torch.empty(0)
        block_tables = torch.empty(0)

        self._cached_prefill_metadata = PlaceholderAttentionMetadata(
            num_prefills=self.num_prefills,
            num_prefill_tokens=self.num_prefill_tokens,
            num_decode_tokens=0,
            slot_mapping=slot_mapping,
            multi_modal_placeholder_index_maps=self.
            multi_modal_placeholder_index_maps,
            enable_kv_scales_calculation=self.enable_kv_scales_calculation,
            seq_lens=seq_lens,
            seq_lens_tensor=seq_lens_tensor,
            max_decode_query_len=0,
            max_query_len=self.max_query_len,
            max_prefill_seq_len=self.max_prefill_seq_len,
            max_decode_seq_len=0,
            query_start_loc=query_start_loc,
            seq_start_loc=seq_start_loc,
            context_lens_tensor=context_lens_tensor,
            block_tables=block_tables,
            use_cuda_graph=False,
        )
        return self._cached_prefill_metadata

    @property
    def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
        if self.num_decode_tokens == 0:
            return None

        if self._cached_decode_metadata is not None:
            return self._cached_decode_metadata
        assert self.seq_lens_tensor is not None

        # Placeholders
        slot_mapping = torch.empty(0)
        block_tables = torch.empty(0)
        seq_lens_tensor = (None if self.seq_lens_tensor is None else
                           self.seq_lens_tensor[self.num_prefills:])

        self._cached_decode_metadata = PlaceholderAttentionMetadata(
            num_prefills=0,
            num_prefill_tokens=0,
            num_decode_tokens=self.num_decode_tokens,
            slot_mapping=slot_mapping,
            multi_modal_placeholder_index_maps=None,
            enable_kv_scales_calculation=True,
            seq_lens=None,
            seq_lens_tensor=seq_lens_tensor,
            max_decode_query_len=self.max_decode_query_len,
            max_query_len=None,
            max_prefill_seq_len=0,
            max_decode_seq_len=self.max_decode_seq_len,
            query_start_loc=(self.query_start_loc[self.num_prefills:] -
                             self.query_start_loc[self.num_prefills])
            if self.query_start_loc is not None else None,
            seq_start_loc=self.seq_start_loc[self.num_prefills:]
            if self.seq_start_loc is not None else None,
            context_lens_tensor=None,
            block_tables=block_tables,
            use_cuda_graph=self.use_cuda_graph,
        )
        return self._cached_decode_metadata

    def advance_step(self,
                     model_input: "ModelInputForGPUWithSamplingMetadata",
                     sampled_token_ids: Optional[torch.Tensor],
                     block_size: int,
                     num_seqs: int,
                     num_queries: int,
                     turn_prefills_into_decodes: bool = False):
        """
        Update metadata in-place to advance one decode step.
        """
        # When using cudagraph, num_seqs is padded to the next captured
        # batch size, but num_queries tracks the actual number of requests in
        # the batch. For --enforce-eager mode, num_seqs == num_queries.
        if num_seqs != num_queries:
            assert num_seqs > num_queries
            assert self.use_cuda_graph

        assert not turn_prefills_into_decodes, \
            ("Multi-Step + Chunked-Prefill is not supported for attention-free"
             "models. turn_prefills_into_decodes is a "
             "Multi-Step + Chunked-Prefill specific parameter.")

        assert self.seq_lens is not None
        assert self.max_decode_seq_len == max(self.seq_lens)

        assert self.num_prefills == 0
        assert self.num_prefill_tokens == 0
        assert self.num_decode_tokens == num_seqs

        assert self.seq_lens is not None
        assert len(self.seq_lens) == num_seqs
        assert self.seq_lens_tensor is not None
        assert self.seq_lens_tensor.shape == (num_seqs, )
        assert self.max_query_len == 1
        assert self.max_prefill_seq_len == 0

        assert self.query_start_loc is not None
        assert self.query_start_loc.shape == (num_queries + 1, )
        assert self.seq_start_loc is not None
        assert self.seq_start_loc.shape == (num_seqs + 1, )

        assert self.context_lens_tensor is not None
        assert self.context_lens_tensor.shape == (num_queries, )

        # Update query lengths. Note that we update only queries and not seqs,
        # since tensors may be padded due to captured cuda graph batch size
        for i in range(num_queries):
            self.seq_lens[i] += 1
        self.max_decode_seq_len = max(self.seq_lens)

        # Update sequences, masking off entries greater than num_queries
        device = self.seq_lens_tensor.device
        mask = torch.arange(self.seq_lens_tensor.size(0),
                            device=device) < num_queries
        self.seq_lens_tensor += mask.to(self.seq_lens_tensor.dtype)
        if sampled_token_ids is not None:
            model_input.input_tokens.masked_scatter_(
                mask, sampled_token_ids[:num_queries])
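
The prefill_metadata and decode_metadata properties split a mixed batch into cached per-phase views: the first num_prefills entries go to the prefill view and the remainder to the decode view, with query_start_loc rebased to zero. A minimal sketch (the concrete lengths below are illustrative, not taken from the source):

import torch

from vllm.attention.backends.placeholder_attn import PlaceholderAttentionMetadata

# One prefill sequence of 4 new tokens and one decode sequence at length 7.
meta = PlaceholderAttentionMetadata(
    num_prefills=1,
    num_prefill_tokens=4,
    num_decode_tokens=1,
    slot_mapping=torch.empty(0),              # placeholder, unused
    multi_modal_placeholder_index_maps=None,
    enable_kv_scales_calculation=True,
    seq_lens=[4, 7],
    seq_lens_tensor=torch.tensor([4, 7]),
    max_prefill_seq_len=4,
    max_decode_seq_len=7,
    context_lens_tensor=torch.tensor([0, 6]),
    use_cuda_graph=False,
    max_query_len=4,
    max_decode_query_len=1,
    query_start_loc=torch.tensor([0, 4, 5]),
    seq_start_loc=torch.tensor([0, 4, 11]),
)

prefill = meta.prefill_metadata   # first num_prefills entries
decode = meta.decode_metadata     # remaining entries, rebased query_start_loc
print(prefill.seq_lens)           # [4]
print(decode.seq_lens_tensor)     # tensor([7])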

_cached_decode_metadata class-attribute instance-attribute

_cached_decode_metadata: Optional[
    PlaceholderAttentionMetadata
] = None

_cached_prefill_metadata class-attribute instance-attribute

_cached_prefill_metadata: Optional[
    PlaceholderAttentionMetadata
] = None

block_tables class-attribute instance-attribute

block_tables: Optional[Tensor] = None

context_lens_tensor instance-attribute

context_lens_tensor: Optional[Tensor]

decode_metadata property

max_decode_query_len instance-attribute

max_decode_query_len: Optional[int]

max_decode_seq_len instance-attribute

max_decode_seq_len: int

max_prefill_seq_len instance-attribute

max_prefill_seq_len: int

max_query_len instance-attribute

max_query_len: Optional[int]

prefill_metadata property

query_start_loc class-attribute instance-attribute

query_start_loc: Optional[Tensor] = None

seq_lens instance-attribute

seq_lens: Optional[List[int]]

seq_lens_tensor instance-attribute

seq_lens_tensor: Optional[Tensor]

seq_start_loc class-attribute instance-attribute

seq_start_loc: Optional[Tensor] = None

use_cuda_graph instance-attribute

use_cuda_graph: bool

__init__

__init__(
    num_prefills: int,
    num_prefill_tokens: int,
    num_decode_tokens: int,
    slot_mapping: Tensor,
    multi_modal_placeholder_index_maps: Optional[
        Dict[str, IndexMap]
    ],
    enable_kv_scales_calculation: bool,
    seq_lens: Optional[List[int]],
    seq_lens_tensor: Optional[Tensor],
    max_prefill_seq_len: int,
    max_decode_seq_len: int,
    context_lens_tensor: Optional[Tensor],
    use_cuda_graph: bool,
    max_query_len: Optional[int],
    max_decode_query_len: Optional[int],
    query_start_loc: Optional[Tensor] = None,
    seq_start_loc: Optional[Tensor] = None,
    block_tables: Optional[Tensor] = None,
    _cached_prefill_metadata: Optional[
        PlaceholderAttentionMetadata
    ] = None,
    _cached_decode_metadata: Optional[
        PlaceholderAttentionMetadata
    ] = None,
) -> None

advance_step

advance_step(
    model_input: ModelInputForGPUWithSamplingMetadata,
    sampled_token_ids: Optional[Tensor],
    block_size: int,
    num_seqs: int,
    num_queries: int,
    turn_prefills_into_decodes: bool = False,
)

Update metadata in-place to advance one decode step.

Source code in vllm/attention/backends/placeholder_attn.py
def advance_step(self,
                 model_input: "ModelInputForGPUWithSamplingMetadata",
                 sampled_token_ids: Optional[torch.Tensor],
                 block_size: int,
                 num_seqs: int,
                 num_queries: int,
                 turn_prefills_into_decodes: bool = False):
    """
    Update metadata in-place to advance one decode step.
    """
    # When using cudagraph, num_seqs is padded to the next captured
    # batch size, but num_queries tracks the actual number of requests in
    # the batch. For --enforce-eager mode, num_seqs == num_queries.
    if num_seqs != num_queries:
        assert num_seqs > num_queries
        assert self.use_cuda_graph

    assert not turn_prefills_into_decodes, \
        ("Multi-Step + Chunked-Prefill is not supported for attention-free"
         "models. turn_prefills_into_decodes is a "
         "Multi-Step + Chunked-Prefill specific parameter.")

    assert self.seq_lens is not None
    assert self.max_decode_seq_len == max(self.seq_lens)

    assert self.num_prefills == 0
    assert self.num_prefill_tokens == 0
    assert self.num_decode_tokens == num_seqs

    assert self.seq_lens is not None
    assert len(self.seq_lens) == num_seqs
    assert self.seq_lens_tensor is not None
    assert self.seq_lens_tensor.shape == (num_seqs, )
    assert self.max_query_len == 1
    assert self.max_prefill_seq_len == 0

    assert self.query_start_loc is not None
    assert self.query_start_loc.shape == (num_queries + 1, )
    assert self.seq_start_loc is not None
    assert self.seq_start_loc.shape == (num_seqs + 1, )

    assert self.context_lens_tensor is not None
    assert self.context_lens_tensor.shape == (num_queries, )

    # Update query lengths. Note that we update only queries and not seqs,
    # since tensors may be padded due to captured cuda graph batch size
    for i in range(num_queries):
        self.seq_lens[i] += 1
    self.max_decode_seq_len = max(self.seq_lens)

    # Update sequences, masking off entries greater than num_queries
    device = self.seq_lens_tensor.device
    mask = torch.arange(self.seq_lens_tensor.size(0),
                        device=device) < num_queries
    self.seq_lens_tensor += mask.to(self.seq_lens_tensor.dtype)
    if sampled_token_ids is not None:
        model_input.input_tokens.masked_scatter_(
            mask, sampled_token_ids[:num_queries])
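
When CUDA graphs are in use, the tensors are padded to the captured batch size, so only the first num_queries entries are advanced. The masking pattern used above can be illustrated in isolation (a standalone sketch, not a call into advance_step; the numbers are illustrative):

import torch

num_queries = 3                               # real requests in the batch
seq_lens_tensor = torch.tensor([5, 8, 6, 0])  # padded to a captured batch size of 4

mask = torch.arange(seq_lens_tensor.size(0)) < num_queries
seq_lens_tensor += mask.to(seq_lens_tensor.dtype)
print(seq_lens_tensor)                        # tensor([6, 9, 7, 0]) -- padding untouched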

PlaceholderAttentionMetadataBuilder

Bases: AttentionMetadataBuilder[PlaceholderAttentionMetadata]

Source code in vllm/attention/backends/placeholder_attn.py
class PlaceholderAttentionMetadataBuilder(
        AttentionMetadataBuilder[PlaceholderAttentionMetadata]):

    def __init__(self, input_builder: "ModelInputForGPUBuilder"):

        self.input_builder = input_builder
        self.runner = input_builder.runner

    def prepare(self):
        self.prefill_seq_lens: List[int] = []
        self.context_lens: List[int] = []
        self.curr_seq_lens: List[int] = []
        self.multimodal_placeholder_maps: Dict[
            str,
            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
        self.num_prefills = 0
        self.num_prefill_tokens = 0
        self.num_decode_tokens = 0

    def _add_seq_group(
            self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
            chunked_prefill_enabled: bool):
        """Add a sequence group to the metadata. Specifically update/append
        1. context length.
        """
        is_prompt = inter_data.is_prompt

        for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
             curr_sliding_window_block) in zip(
                 inter_data.seq_ids, [len(t) for t in inter_data.input_tokens],
                 inter_data.orig_seq_lens, inter_data.seq_lens,
                 inter_data.query_lens, inter_data.context_lens,
                 inter_data.curr_sliding_window_blocks):
            self.context_lens.append(context_len)

            if is_prompt:
                mm_maps = inter_data.multi_modal_placeholder_maps
                if mm_maps:
                    for modality, placeholders in mm_maps.items():
                        self.multimodal_placeholder_maps[modality].extend(
                            placeholders)

                self.num_prefills += 1
                self.num_prefill_tokens += token_len
                self.prefill_seq_lens.append(seq_len)
            else:
                self.num_decode_tokens += query_len
                self.curr_seq_lens.append(curr_seq_len)

    def build(self, seq_lens: List[int], query_lens: List[int],
              cuda_graph_pad_size: int, batch_size: int):
        """Build attention metadata with on-device tensors.

        Args:
            seq_lens: The maybe padded sequence lengths of the input sequences.
            query_lens: The query lengths of the input sequences.
            cuda_graph_pad_size: The padding size for cuda graph.
                                 -1 if cuda graph is not used.
            batch_size: The maybe padded batch size.
        """

        # Some input builders such as ModelInputForCPUBuilder do not have the
        # "inter_data_list" attribute.
        # Let's check inter_data_list exists before we reference it.
        if hasattr(self.input_builder, "inter_data_list"):
            for inter_data in self.input_builder.inter_data_list:
                self._add_seq_group(inter_data,
                                    self.input_builder.chunked_prefill_enabled)

        device = self.runner.device
        use_captured_graph = cuda_graph_pad_size != -1

        max_query_len = max(query_lens)
        decode_query_lens = query_lens[self.num_prefills:]
        if len(decode_query_lens) > 0:
            max_decode_query_len = max(decode_query_lens)
        else:
            max_decode_query_len = 1
        max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
        max_decode_seq_len = max(self.curr_seq_lens, default=0)
        num_decode_tokens = self.num_decode_tokens
        query_start_loc = list(accumulate(query_lens, initial=0))
        seq_start_loc = list(accumulate(seq_lens, initial=0))

        if use_captured_graph:
            num_decode_tokens = batch_size - self.num_prefill_tokens
        assert max_query_len > 0, ("query_lens: {}".format(query_lens))

        assert device is not None
        context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int,
                                               device, self.runner.pin_memory)
        seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
                                           self.runner.pin_memory)
        query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32,
                                                  device,
                                                  self.runner.pin_memory)
        seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32,
                                                device, self.runner.pin_memory)

        placeholder_index_maps = {
            modality: placeholder_map.index_map()
            for modality, placeholder_map in
            self.multimodal_placeholder_maps.items()
        }

        # Placeholders
        slot_mapping_tensor = torch.empty(0)
        block_tables = torch.empty(0)

        return PlaceholderAttentionMetadata(
            num_prefills=self.num_prefills,
            slot_mapping=slot_mapping_tensor,
            multi_modal_placeholder_index_maps=placeholder_index_maps,
            enable_kv_scales_calculation=True,
            num_prefill_tokens=self.num_prefill_tokens,
            num_decode_tokens=num_decode_tokens,
            seq_lens=seq_lens,
            seq_lens_tensor=seq_lens_tensor,
            max_query_len=max_query_len,
            max_decode_query_len=max_decode_query_len,
            max_prefill_seq_len=max_prefill_seq_len,
            max_decode_seq_len=max_decode_seq_len,
            query_start_loc=query_start_loc_tensor,
            seq_start_loc=seq_start_loc_tensor,
            context_lens_tensor=context_lens_tensor,
            block_tables=block_tables,
            use_cuda_graph=use_captured_graph,
        )

input_builder instance-attribute

input_builder = input_builder

runner instance-attribute

runner = runner

__init__

__init__(input_builder: ModelInputForGPUBuilder)
Source code in vllm/attention/backends/placeholder_attn.py
def __init__(self, input_builder: "ModelInputForGPUBuilder"):

    self.input_builder = input_builder
    self.runner = input_builder.runner

_add_seq_group

_add_seq_group(
    inter_data: InterDataForSeqGroup,
    chunked_prefill_enabled: bool,
)

Add a sequence group to the metadata. Specifically, update/append the context length.

Source code in vllm/attention/backends/placeholder_attn.py
def _add_seq_group(
        self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
        chunked_prefill_enabled: bool):
    """Add a sequence group to the metadata. Specifically update/append
    1. context length.
    """
    is_prompt = inter_data.is_prompt

    for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
         curr_sliding_window_block) in zip(
             inter_data.seq_ids, [len(t) for t in inter_data.input_tokens],
             inter_data.orig_seq_lens, inter_data.seq_lens,
             inter_data.query_lens, inter_data.context_lens,
             inter_data.curr_sliding_window_blocks):
        self.context_lens.append(context_len)

        if is_prompt:
            mm_maps = inter_data.multi_modal_placeholder_maps
            if mm_maps:
                for modality, placeholders in mm_maps.items():
                    self.multimodal_placeholder_maps[modality].extend(
                        placeholders)

            self.num_prefills += 1
            self.num_prefill_tokens += token_len
            self.prefill_seq_lens.append(seq_len)
        else:
            self.num_decode_tokens += query_len
            self.curr_seq_lens.append(curr_seq_len)

build

build(
    seq_lens: List[int],
    query_lens: List[int],
    cuda_graph_pad_size: int,
    batch_size: int,
)

Build attention metadata with on-device tensors.

Parameters:

    seq_lens (List[int], required): The maybe padded sequence lengths of the
        input sequences.
    query_lens (List[int], required): The query lengths of the input sequences.
    cuda_graph_pad_size (int, required): The padding size for cuda graph.
        -1 if cuda graph is not used.
    batch_size (int, required): The maybe padded batch size.
Source code in vllm/attention/backends/placeholder_attn.py
def build(self, seq_lens: List[int], query_lens: List[int],
          cuda_graph_pad_size: int, batch_size: int):
    """Build attention metadata with on-device tensors.

    Args:
        seq_lens: The maybe padded sequence lengths of the input sequences.
        query_lens: The query lengths of the input sequences.
        cuda_graph_pad_size: The padding size for cuda graph.
                             -1 if cuda graph is not used.
        batch_size: The maybe padded batch size.
    """

    # Some input builders such as ModelInputForCPUBuilder do not have the
    # "inter_data_list" attribute.
    # Let's check inter_data_list exists before we reference it.
    if hasattr(self.input_builder, "inter_data_list"):
        for inter_data in self.input_builder.inter_data_list:
            self._add_seq_group(inter_data,
                                self.input_builder.chunked_prefill_enabled)

    device = self.runner.device
    use_captured_graph = cuda_graph_pad_size != -1

    max_query_len = max(query_lens)
    decode_query_lens = query_lens[self.num_prefills:]
    if len(decode_query_lens) > 0:
        max_decode_query_len = max(decode_query_lens)
    else:
        max_decode_query_len = 1
    max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
    max_decode_seq_len = max(self.curr_seq_lens, default=0)
    num_decode_tokens = self.num_decode_tokens
    query_start_loc = list(accumulate(query_lens, initial=0))
    seq_start_loc = list(accumulate(seq_lens, initial=0))

    if use_captured_graph:
        num_decode_tokens = batch_size - self.num_prefill_tokens
    assert max_query_len > 0, ("query_lens: {}".format(query_lens))

    assert device is not None
    context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int,
                                           device, self.runner.pin_memory)
    seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
                                       self.runner.pin_memory)
    query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32,
                                              device,
                                              self.runner.pin_memory)
    seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32,
                                            device, self.runner.pin_memory)

    placeholder_index_maps = {
        modality: placeholder_map.index_map()
        for modality, placeholder_map in
        self.multimodal_placeholder_maps.items()
    }

    # Placeholders
    slot_mapping_tensor = torch.empty(0)
    block_tables = torch.empty(0)

    return PlaceholderAttentionMetadata(
        num_prefills=self.num_prefills,
        slot_mapping=slot_mapping_tensor,
        multi_modal_placeholder_index_maps=placeholder_index_maps,
        enable_kv_scales_calculation=True,
        num_prefill_tokens=self.num_prefill_tokens,
        num_decode_tokens=num_decode_tokens,
        seq_lens=seq_lens,
        seq_lens_tensor=seq_lens_tensor,
        max_query_len=max_query_len,
        max_decode_query_len=max_decode_query_len,
        max_prefill_seq_len=max_prefill_seq_len,
        max_decode_seq_len=max_decode_seq_len,
        query_start_loc=query_start_loc_tensor,
        seq_start_loc=seq_start_loc_tensor,
        context_lens_tensor=context_lens_tensor,
        block_tables=block_tables,
        use_cuda_graph=use_captured_graph,
    )
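
Outside a full model runner, the builder only needs an input_builder that exposes runner.device, runner.pin_memory, and (optionally) inter_data_list with chunked_prefill_enabled. A minimal sketch using stand-in SimpleNamespace objects whose fields mirror the attributes read above; all stand-in values are illustrative assumptions, not part of the vLLM API:

import torch
from types import SimpleNamespace

from vllm.attention.backends.placeholder_attn import (
    PlaceholderAttentionMetadataBuilder)

# Stand-in runner and sequence-group data (illustrative only).
runner = SimpleNamespace(device=torch.device("cpu"), pin_memory=False)
inter_data = SimpleNamespace(
    is_prompt=True,
    seq_ids=[0],
    input_tokens=[[1, 2, 3, 4, 5]],   # 5 prompt tokens
    orig_seq_lens=[5],
    seq_lens=[5],
    query_lens=[5],
    context_lens=[0],                 # no tokens computed so far
    curr_sliding_window_blocks=[None],
    multi_modal_placeholder_maps=None,
)
input_builder = SimpleNamespace(runner=runner,
                                inter_data_list=[inter_data],
                                chunked_prefill_enabled=False)

builder = PlaceholderAttentionMetadataBuilder(input_builder)
builder.prepare()
meta = builder.build(seq_lens=[5], query_lens=[5],
                     cuda_graph_pad_size=-1, batch_size=1)
print(meta.num_prefills, meta.num_prefill_tokens)  # 1 5
print(meta.block_tables.numel())                   # 0 -- placeholder only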

prepare

prepare()
Source code in vllm/attention/backends/placeholder_attn.py
def prepare(self):
    self.prefill_seq_lens: List[int] = []
    self.context_lens: List[int] = []
    self.curr_seq_lens: List[int] = []
    self.multimodal_placeholder_maps: Dict[
        str,
        MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
    self.num_prefills = 0
    self.num_prefill_tokens = 0
    self.num_decode_tokens = 0