Skip to content

vllm.core.interfaces

AllocStatus

Bases: Enum

Result for BlockSpaceManager.can_allocate

  1. Ok: seq_group can be allocated now.
  2. Later: seq_group cannot be allocated yet. The allocator's capacity is large enough for what seq_group requires, so allocation may succeed later.
  3. Never: seq_group can never be allocated. The seq_group is too large to be allocated in GPU memory.
Source code in vllm/core/interfaces.py
class AllocStatus(enum.Enum):
    """Result for BlockSpaceManager.can_allocate.

    1. Ok: seq_group can be allocated now.
    2. Later: seq_group cannot be allocated yet; the allocator's
      capacity is large enough for what seq_group requires, so
      allocation may succeed later.
    3. Never: seq_group can never be allocated; it is too large to
      be allocated in GPU memory.
    """
    OK = enum.auto()
    LATER = enum.auto()
    NEVER = enum.auto()

LATER class-attribute instance-attribute

LATER = auto()

NEVER class-attribute instance-attribute

NEVER = auto()

OK class-attribute instance-attribute

OK = auto()

BlockSpaceManager

Bases: ABC

Source code in vllm/core/interfaces.py
class BlockSpaceManager(ABC):
    """Abstract interface for managing the block space (KV-cache blocks)
    used by sequence groups.

    Concrete implementations (see ``get_block_space_manager_class``) decide
    how blocks are allocated, appended, swapped between devices, and freed.
    Allocation feasibility is reported via :class:`AllocStatus`.
    """

    @staticmethod
    def get_block_space_manager_class(version: str):
        """Return the concrete manager class for *version* (case-insensitive).

        Supported versions: ``"selfattn"`` and ``"placeholder"``.

        Raises:
            ValueError: if *version* is not a recognized name.
        """
        version = version.lower()

        if version == "selfattn":
            # Imports are local, presumably to avoid circular imports at
            # module load time — confirm before hoisting.
            from vllm.core.block_manager import SelfAttnBlockSpaceManager
            return SelfAttnBlockSpaceManager

        if version == "placeholder":
            from vllm.core.placeholder_block_space_manager import (
                PlaceholderBlockSpaceManager)
            return PlaceholderBlockSpaceManager

        raise ValueError(f"Unknown version {version=}")

    @abstractmethod
    def can_allocate(self,
                     seq_group: SequenceGroup,
                     num_lookahead_slots: int = 0) -> AllocStatus:
        """Report whether *seq_group* (plus lookahead slots) can be allocated."""
        pass

    @abstractmethod
    def allocate(self, seq_group: SequenceGroup) -> None:
        """Allocate blocks for *seq_group*."""
        pass

    @abstractmethod
    def can_append_slots(self, seq_group: SequenceGroup,
                         num_lookahead_slots: int) -> bool:
        """Return True if slots can be appended for *seq_group*."""
        pass

    @abstractmethod
    def append_slots(
        self,
        seq: Sequence,
        num_lookahead_slots: int,
    ) -> List[Tuple[int, int]]:
        """Append slots for *seq*; returns a list of (int, int) pairs.

        NOTE(review): the pairs presumably describe block copies
        (source, destination) — confirm against implementations.
        """
        pass

    @abstractmethod
    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
        """Make *child_seq* share/derive block state from *parent_seq*."""
        pass

    @abstractmethod
    def can_swap_in(self, seq_group: SequenceGroup,
                    num_lookahead_slots: int) -> AllocStatus:
        """Report whether *seq_group* can be swapped into GPU memory."""
        pass

    @abstractmethod
    def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
        """Swap *seq_group* in; returns (int, int) block-mapping pairs."""
        pass

    @abstractmethod
    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
        """Return True if *seq_group* can be swapped out."""
        pass

    @abstractmethod
    def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
        """Swap *seq_group* out; returns (int, int) block-mapping pairs."""
        pass

    @abstractmethod
    def free(self, seq: Sequence) -> None:
        """Free all blocks held by *seq*."""
        pass

    @abstractmethod
    def get_block_table(self, seq: Sequence) -> List[int]:
        """Return the block IDs assigned to *seq*."""
        pass

    @abstractmethod
    def get_num_free_gpu_blocks(self) -> int:
        """Return the number of free GPU blocks."""
        pass

    @abstractmethod
    def get_num_free_cpu_blocks(self) -> int:
        """Return the number of free CPU blocks."""
        pass

    @abstractmethod
    def access_all_blocks_in_seq(
        self,
        seq: Sequence,
        access_time: float,
    ) -> None:
        """Record *access_time* for all blocks of *seq* (e.g. for eviction)."""
        pass

    @abstractmethod
    def get_common_computed_block_ids(
            self, seqs: List[Sequence]) -> GenericSequence[int]:
        """Return block IDs already computed and common to all *seqs*."""
        pass

    @abstractmethod
    def mark_blocks_as_computed(self, seq_group: SequenceGroup,
                                token_chunk_size: int):
        """Mark blocks covered by *token_chunk_size* tokens as computed."""
        pass

    @abstractmethod
    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
        pass

    @abstractmethod
    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
        """Reset prefix cache for specified or all devices."""
        pass

    @abstractmethod
    def get_num_cached_tokens(self, seq: Sequence) -> int:
        """Return the number of tokens of *seq* served from cache."""
        pass

    @abstractmethod
    def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None:
        """Stop tracking computed blocks for *seq*."""
        pass

access_all_blocks_in_seq abstractmethod

access_all_blocks_in_seq(
    seq: Sequence, access_time: float
) -> None
Source code in vllm/core/interfaces.py
@abstractmethod
def access_all_blocks_in_seq(
    self,
    seq: Sequence,
    access_time: float,
) -> None:
    pass

allocate abstractmethod

allocate(seq_group: SequenceGroup) -> None
Source code in vllm/core/interfaces.py
@abstractmethod
def allocate(self, seq_group: SequenceGroup) -> None:
    pass

append_slots abstractmethod

append_slots(
    seq: Sequence, num_lookahead_slots: int
) -> List[Tuple[int, int]]
Source code in vllm/core/interfaces.py
@abstractmethod
def append_slots(
    self,
    seq: Sequence,
    num_lookahead_slots: int,
) -> List[Tuple[int, int]]:
    pass

can_allocate abstractmethod

can_allocate(
    seq_group: SequenceGroup, num_lookahead_slots: int = 0
) -> AllocStatus
Source code in vllm/core/interfaces.py
@abstractmethod
def can_allocate(self,
                 seq_group: SequenceGroup,
                 num_lookahead_slots: int = 0) -> AllocStatus:
    pass

can_append_slots abstractmethod

can_append_slots(
    seq_group: SequenceGroup, num_lookahead_slots: int
) -> bool
Source code in vllm/core/interfaces.py
@abstractmethod
def can_append_slots(self, seq_group: SequenceGroup,
                     num_lookahead_slots: int) -> bool:
    pass

can_swap_in abstractmethod

can_swap_in(
    seq_group: SequenceGroup, num_lookahead_slots: int
) -> AllocStatus
Source code in vllm/core/interfaces.py
@abstractmethod
def can_swap_in(self, seq_group: SequenceGroup,
                num_lookahead_slots: int) -> AllocStatus:
    pass

can_swap_out abstractmethod

can_swap_out(seq_group: SequenceGroup) -> bool
Source code in vllm/core/interfaces.py
@abstractmethod
def can_swap_out(self, seq_group: SequenceGroup) -> bool:
    pass

fork abstractmethod

fork(parent_seq: Sequence, child_seq: Sequence) -> None
Source code in vllm/core/interfaces.py
@abstractmethod
def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
    pass

free abstractmethod

free(seq: Sequence) -> None
Source code in vllm/core/interfaces.py
@abstractmethod
def free(self, seq: Sequence) -> None:
    pass

get_block_space_manager_class staticmethod

get_block_space_manager_class(version: str)
Source code in vllm/core/interfaces.py
@staticmethod
def get_block_space_manager_class(version: str):
    version = version.lower()

    if version == "selfattn":
        from vllm.core.block_manager import SelfAttnBlockSpaceManager
        return SelfAttnBlockSpaceManager

    if version == "placeholder":
        from vllm.core.placeholder_block_space_manager import (
            PlaceholderBlockSpaceManager)
        return PlaceholderBlockSpaceManager

    raise ValueError(f"Unknown version {version=}")

get_block_table abstractmethod

get_block_table(seq: Sequence) -> List[int]
Source code in vllm/core/interfaces.py
@abstractmethod
def get_block_table(self, seq: Sequence) -> List[int]:
    pass

get_common_computed_block_ids abstractmethod

get_common_computed_block_ids(
    seqs: List[Sequence],
) -> Sequence[int]
Source code in vllm/core/interfaces.py
@abstractmethod
def get_common_computed_block_ids(
        self, seqs: List[Sequence]) -> GenericSequence[int]:
    pass

get_num_cached_tokens abstractmethod

get_num_cached_tokens(seq: Sequence) -> int
Source code in vllm/core/interfaces.py
@abstractmethod
def get_num_cached_tokens(self, seq: Sequence) -> int:
    pass

get_num_free_cpu_blocks abstractmethod

get_num_free_cpu_blocks() -> int
Source code in vllm/core/interfaces.py
@abstractmethod
def get_num_free_cpu_blocks(self) -> int:
    pass

get_num_free_gpu_blocks abstractmethod

get_num_free_gpu_blocks() -> int
Source code in vllm/core/interfaces.py
@abstractmethod
def get_num_free_gpu_blocks(self) -> int:
    pass

get_prefix_cache_hit_rate abstractmethod

get_prefix_cache_hit_rate(device: Device) -> float

Prefix cache hit rate. -1 means not supported or disabled.

Source code in vllm/core/interfaces.py
@abstractmethod
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Prefix cache hit rate. -1 means not supported or disabled."""
    pass

mark_blocks_as_computed abstractmethod

mark_blocks_as_computed(
    seq_group: SequenceGroup, token_chunk_size: int
)
Source code in vllm/core/interfaces.py
@abstractmethod
def mark_blocks_as_computed(self, seq_group: SequenceGroup,
                            token_chunk_size: int):
    pass

remove_seq_from_computed_blocks_tracker abstractmethod

remove_seq_from_computed_blocks_tracker(
    seq: Sequence,
) -> None
Source code in vllm/core/interfaces.py
@abstractmethod
def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None:
    pass

reset_prefix_cache abstractmethod

reset_prefix_cache(device: Optional[Device] = None) -> bool

Reset prefix cache for specified or all devices.

Source code in vllm/core/interfaces.py
@abstractmethod
def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
    """Reset prefix cache for specified or all devices."""
    pass

swap_in abstractmethod

swap_in(seq_group: SequenceGroup) -> List[Tuple[int, int]]
Source code in vllm/core/interfaces.py
@abstractmethod
def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
    pass

swap_out abstractmethod

swap_out(seq_group: SequenceGroup) -> List[Tuple[int, int]]
Source code in vllm/core/interfaces.py
@abstractmethod
def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
    pass