vllm.attention.ops.hpu_paged_attn
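
Paged-attention helpers for vLLM's HPU (Intel Gaudi) backend. HPUPagedAttention exposes the HPU cache and attention kernels through static methods, and HPUPagedAttentionMetadata is a dataclass holding the block-layout tensors used by the backend.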

_PARTITION_SIZE module-attribute

_PARTITION_SIZE = 512

HPUPagedAttention

Source code in vllm/attention/ops/hpu_paged_attn.py
class HPUPagedAttention:

    @staticmethod
    def get_supported_head_sizes() -> List[int]:
        return [64, 80, 96, 112, 128, 256]

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
    ) -> Tuple[int, ...]:
        return (num_blocks, block_size, num_kv_heads, head_size)

    @staticmethod
    def split_kv_cache(
        kv_cache: torch.Tensor,
        num_kv_heads: int,
        head_size: int,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        key_cache = kv_cache[0]
        value_cache = kv_cache[1]
        return key_cache, value_cache

    @staticmethod
    def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor,
                             key_cache: torch.Tensor,
                             value_cache: torch.Tensor,
                             slot_mapping: torch.Tensor, kv_cache_dtype: str,
                             is_prompt: bool) -> None:
        cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
                                    slot_mapping, kv_cache_dtype, is_prompt)

    @staticmethod
    def forward_decode(**kwargs) -> torch.Tensor:
        return ops.flat_pa(**kwargs)

    @staticmethod
    def swap_blocks(
        src_kv_cache: Tuple[torch.Tensor, torch.Tensor],
        dst_kv_cache: Tuple[torch.Tensor, torch.Tensor],
        src_to_dsts: torch.Tensor,
    ) -> None:
        src_key_cache = src_kv_cache[0]
        dst_key_cache = dst_kv_cache[0]
        cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dsts)

        src_value_cache = src_kv_cache[1]
        dst_value_cache = dst_kv_cache[1]
        cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dsts)

    @staticmethod
    def copy_blocks(
        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
        src_to_dsts: torch.Tensor,
    ) -> None:
        key_caches = [kv_cache[0] for kv_cache in kv_caches]
        value_caches = [kv_cache[1] for kv_cache in kv_caches]
        cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts)

copy_blocks staticmethod

copy_blocks(
    kv_caches: List[Tuple[Tensor, Tensor]],
    src_to_dsts: Tensor,
) -> None
Source code in vllm/attention/ops/hpu_paged_attn.py
@staticmethod
def copy_blocks(
    kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
    src_to_dsts: torch.Tensor,
) -> None:
    key_caches = [kv_cache[0] for kv_cache in kv_caches]
    value_caches = [kv_cache[1] for kv_cache in kv_caches]
    cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts)
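
A minimal sketch of driving copy_blocks, assuming a Gaudi/HPU runtime where the underlying cache_ops kernel is available; the cache sizes and the (src, dst) pair format of src_to_dsts are illustrative assumptions.

import torch

from vllm.attention.ops.hpu_paged_attn import HPUPagedAttention

shape = HPUPagedAttention.get_kv_cache_shape(
    num_blocks=4, block_size=16, num_kv_heads=2, head_size=64)
# Two layers of paged KV cache, each stored as a (key_cache, value_cache) tuple.
kv_caches = [(torch.zeros(shape), torch.zeros(shape)) for _ in range(2)]

# Assumed format: one (src_block, dst_block) pair per row.
src_to_dsts = torch.tensor([[0, 2]], dtype=torch.int64)
HPUPagedAttention.copy_blocks(kv_caches, src_to_dsts)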

forward_decode staticmethod

forward_decode(**kwargs) -> Tensor
Source code in vllm/attention/ops/hpu_paged_attn.py
@staticmethod
def forward_decode(**kwargs) -> torch.Tensor:
    return ops.flat_pa(**kwargs)
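
forward_decode is a thin pass-through: every keyword argument is forwarded unchanged to ops.flat_pa, so the set of accepted keyword arguments is defined by the underlying flat_pa kernel rather than by this wrapper.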

get_kv_cache_shape staticmethod

get_kv_cache_shape(
    num_blocks: int,
    block_size: int,
    num_kv_heads: int,
    head_size: int,
) -> Tuple[int, ...]
Source code in vllm/attention/ops/hpu_paged_attn.py
@staticmethod
def get_kv_cache_shape(
    num_blocks: int,
    block_size: int,
    num_kv_heads: int,
    head_size: int,
) -> Tuple[int, ...]:
    return (num_blocks, block_size, num_kv_heads, head_size)
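
The returned tuple can be passed straight to a tensor constructor when allocating the cache. A minimal sketch, assuming the HPU backend and its dependencies are importable; the sizes and dtype are arbitrary illustrations.

import torch

from vllm.attention.ops.hpu_paged_attn import HPUPagedAttention

# Illustrative sizes: 16 blocks of 128 slots, 8 KV heads of dimension 128.
shape = HPUPagedAttention.get_kv_cache_shape(
    num_blocks=16, block_size=128, num_kv_heads=8, head_size=128)
# shape == (16, 128, 8, 128)
key_cache = torch.zeros(shape, dtype=torch.bfloat16)
value_cache = torch.zeros(shape, dtype=torch.bfloat16)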

get_supported_head_sizes staticmethod

get_supported_head_sizes() -> List[int]
Source code in vllm/attention/ops/hpu_paged_attn.py
@staticmethod
def get_supported_head_sizes() -> List[int]:
    return [64, 80, 96, 112, 128, 256]
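
A typical use is validating a model's head size before building the attention backend; the check below is just an illustration.

from vllm.attention.ops.hpu_paged_attn import HPUPagedAttention

head_size = 128
if head_size not in HPUPagedAttention.get_supported_head_sizes():
    raise ValueError(f"Head size {head_size} is not supported by the "
                     f"HPU paged attention backend.")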

split_kv_cache staticmethod

split_kv_cache(
    kv_cache: Tensor, num_kv_heads: int, head_size: int
) -> Tuple[Tensor, Tensor]
Source code in vllm/attention/ops/hpu_paged_attn.py
@staticmethod
def split_kv_cache(
    kv_cache: torch.Tensor,
    num_kv_heads: int,
    head_size: int,
) -> Tuple[torch.Tensor, torch.Tensor]:
    key_cache = kv_cache[0]
    value_cache = kv_cache[1]
    return key_cache, value_cache
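
As the body above shows, the method simply indexes the leading dimension of a stacked cache (num_kv_heads and head_size are accepted but unused here). A minimal sketch with arbitrary illustrative sizes:

import torch

from vllm.attention.ops.hpu_paged_attn import HPUPagedAttention

# Key and value caches stacked along a leading dimension of size 2.
kv_cache = torch.zeros(2, 4, 16, 2, 64)
key_cache, value_cache = HPUPagedAttention.split_kv_cache(
    kv_cache, num_kv_heads=2, head_size=64)
assert key_cache.shape == value_cache.shape == (4, 16, 2, 64)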

swap_blocks staticmethod

swap_blocks(
    src_kv_cache: Tuple[Tensor, Tensor],
    dst_kv_cache: Tuple[Tensor, Tensor],
    src_to_dsts: Tensor,
) -> None
Source code in vllm/attention/ops/hpu_paged_attn.py
@staticmethod
def swap_blocks(
    src_kv_cache: Tuple[torch.Tensor, torch.Tensor],
    dst_kv_cache: Tuple[torch.Tensor, torch.Tensor],
    src_to_dsts: torch.Tensor,
) -> None:
    src_key_cache = src_kv_cache[0]
    dst_key_cache = dst_kv_cache[0]
    cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dsts)

    src_value_cache = src_kv_cache[1]
    dst_value_cache = dst_kv_cache[1]
    cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dsts)
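
A hedged sketch of swap_blocks, again assuming an HPU runtime for the underlying cache_ops kernel; src_to_dsts is assumed here to hold (src_block, dst_block) index pairs, and the cache sizes are illustrative.

import torch

from vllm.attention.ops.hpu_paged_attn import HPUPagedAttention

shape = HPUPagedAttention.get_kv_cache_shape(
    num_blocks=4, block_size=16, num_kv_heads=2, head_size=64)
src_kv_cache = (torch.zeros(shape), torch.zeros(shape))
dst_kv_cache = (torch.zeros(shape), torch.zeros(shape))

# Assumed format: swap block 0 of the source into block 3 of the destination.
src_to_dsts = torch.tensor([[0, 3]], dtype=torch.int64)
HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dsts)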

write_to_paged_cache staticmethod

write_to_paged_cache(
    key: Tensor,
    value: Tensor,
    key_cache: Tensor,
    value_cache: Tensor,
    slot_mapping: Tensor,
    kv_cache_dtype: str,
    is_prompt: bool,
) -> None
Source code in vllm/attention/ops/hpu_paged_attn.py
@staticmethod
def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor,
                         key_cache: torch.Tensor,
                         value_cache: torch.Tensor,
                         slot_mapping: torch.Tensor, kv_cache_dtype: str,
                         is_prompt: bool) -> None:
    cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
                                slot_mapping, kv_cache_dtype, is_prompt)
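
A hedged sketch of writing one prompt's keys and values into the paged cache. It assumes an HPU runtime for the reshape_and_cache kernel; the tensor shapes, the "auto" kv_cache_dtype string, and the slot_mapping layout (one flat cache-slot index per token) are illustrative assumptions rather than requirements documented here.

import torch

from vllm.attention.ops.hpu_paged_attn import HPUPagedAttention

num_tokens, num_kv_heads, head_size = 4, 2, 64
key = torch.randn(num_tokens, num_kv_heads, head_size)
value = torch.randn(num_tokens, num_kv_heads, head_size)

shape = HPUPagedAttention.get_kv_cache_shape(
    num_blocks=4, block_size=16, num_kv_heads=num_kv_heads, head_size=head_size)
key_cache = torch.zeros(shape)
value_cache = torch.zeros(shape)

# Assumed: one flat cache-slot index per token of the prompt.
slot_mapping = torch.tensor([0, 1, 2, 3], dtype=torch.int64)
HPUPagedAttention.write_to_paged_cache(key, value, key_cache, value_cache,
                                       slot_mapping, kv_cache_dtype="auto",
                                       is_prompt=True)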

HPUPagedAttentionMetadata dataclass

Metadata for PagedAttention.

Source code in vllm/attention/ops/hpu_paged_attn.py
@dataclass
class HPUPagedAttentionMetadata:
    """Metadata for PagedAttention."""
    block_list: Optional[torch.Tensor]
    block_mapping: Optional[torch.Tensor]
    block_usage: Optional[torch.Tensor]
    block_indices: Optional[torch.Tensor]
    block_offsets: Optional[torch.Tensor]
    block_groups: Optional[torch.Tensor]

block_groups instance-attribute

block_groups: Optional[Tensor]

block_indices instance-attribute

block_indices: Optional[Tensor]

block_list instance-attribute

block_list: Optional[Tensor]

block_mapping instance-attribute

block_mapping: Optional[Tensor]

block_offsets instance-attribute

block_offsets: Optional[Tensor]

block_usage instance-attribute

block_usage: Optional[Tensor]

__init__

__init__(
    block_list: Optional[Tensor],
    block_mapping: Optional[Tensor],
    block_usage: Optional[Tensor],
    block_indices: Optional[Tensor],
    block_offsets: Optional[Tensor],
    block_groups: Optional[Tensor],
) -> None
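
Since HPUPagedAttentionMetadata is a plain dataclass, constructing it only means supplying each block-descriptor tensor (or None). The values below are arbitrary placeholders, not a layout prescribed by this module.

import torch

from vllm.attention.ops.hpu_paged_attn import HPUPagedAttentionMetadata

metadata = HPUPagedAttentionMetadata(
    block_list=torch.tensor([0, 1]),      # hypothetical block ids
    block_mapping=None,
    block_usage=torch.tensor([16, 5]),    # hypothetical per-block usage
    block_indices=torch.tensor([0, 1]),
    block_offsets=torch.tensor([0, 5]),
    block_groups=None,
)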