Skip to content

vllm.model_executor.models.minimax_cache

MinimaxCacheManager

Bases: ConstantSizeCache

Source code in vllm/model_executor/models/minimax_cache.py
class MinimaxCacheManager(ConstantSizeCache):
    """Constant-size cache backed by a single preallocated tensor.

    The backing tensor has shape ``cache_shape``; ``cache_shape[1]`` is the
    maximum batch size passed to ``ConstantSizeCache``.
    """

    def __init__(self, dtype, cache_shape):
        """Allocate the backing cache tensor on the CUDA device.

        Args:
            dtype: Element dtype of the cache tensor.
            cache_shape: Full shape of the cache tensor. Index 1 is used as
                the maximum batch size.
        """
        super().__init__(cache_shape[1])  # max_batch_size is cache_shape[1]
        # torch.empty does not initialize the memory it returns.
        self._minimax_cache = torch.empty(size=cache_shape,
                                          dtype=dtype,
                                          device="cuda")

    @property
    def cache(self):
        """Return the backing cache tensor."""
        return self._minimax_cache

    def _copy_cache(self, from_index: int, to_index: int):
        """Copy the cache entry at ``from_index`` over the one at ``to_index``.

        Args:
            from_index: Source position along dim 1 of the cache tensor.
            to_index: Destination position along dim 1 of the cache tensor.
        """
        assert len(self.cache) > 0
        # ``self.cache`` is one tensor whose dim 1 is the batch dimension
        # (see ``__init__``). Iterating over it yields dim-0 slices, on which
        # ``[:, index]`` would address dim 2 of the full tensor instead, so
        # index dim 1 of the whole tensor directly.
        # NOTE(review): assumes both indices are batch slots along dim 1 --
        # confirm against ConstantSizeCache.
        self.cache[:, to_index].copy_(self.cache[:, from_index],
                                      non_blocking=True)

_minimax_cache instance-attribute

_minimax_cache = empty(
    size=cache_shape, dtype=dtype, device="cuda"
)

cache property

cache

__init__

__init__(dtype, cache_shape)
Source code in vllm/model_executor/models/minimax_cache.py
def __init__(self, dtype, cache_shape):
    """Set up the base cache and allocate the backing tensor on CUDA.

    Args:
        dtype: Element dtype of the cache tensor.
        cache_shape: Full shape of the cache tensor. Index 1 is used as the
            maximum batch size.
    """
    # The second entry of the shape is the maximum batch size.
    max_batch_size = cache_shape[1]
    super().__init__(max_batch_size)

    # torch.empty does not initialize the memory it returns.
    backing = torch.empty(size=cache_shape, dtype=dtype, device="cuda")
    self._minimax_cache = backing

_copy_cache

_copy_cache(from_index: int, to_index: int)
Source code in vllm/model_executor/models/minimax_cache.py
def _copy_cache(self, from_index: int, to_index: int):
    """Copy the cache entry at ``from_index`` over the one at ``to_index``.

    Args:
        from_index: Source position along dim 1 of the cache tensor.
        to_index: Destination position along dim 1 of the cache tensor.
    """
    assert len(self.cache) > 0
    # ``self.cache`` is one tensor whose dim 1 is the batch dimension
    # (``__init__`` passes ``cache_shape[1]`` as max_batch_size). Iterating
    # over it yields dim-0 slices, on which ``[:, index]`` would address
    # dim 2 of the full tensor instead, so index dim 1 of the whole tensor
    # directly.
    # NOTE(review): assumes both indices are batch slots along dim 1 --
    # confirm against ConstantSizeCache.
    self.cache[:, to_index].copy_(self.cache[:, from_index],
                                  non_blocking=True)

MinimaxCacheParams dataclass

Source code in vllm/model_executor/models/minimax_cache.py
@dataclass
class MinimaxCacheParams:
    """Bundle of a cache tensor and its state-indices tensor."""

    # Both fields default to an empty tensor created once at class
    # definition time.
    minimax_cache: torch.Tensor = torch.Tensor()
    state_indices_tensor: torch.Tensor = torch.Tensor()

    def at_layer_idx(self, layer_idx):
        """Return params narrowed to ``layer_idx`` along the cache's first dim.

        The state-indices tensor is passed through unchanged.
        """
        layer_cache = self.minimax_cache[layer_idx, ...]
        return MinimaxCacheParams(
            minimax_cache=layer_cache,
            state_indices_tensor=self.state_indices_tensor,
        )

minimax_cache class-attribute instance-attribute

minimax_cache: Tensor = Tensor()

state_indices_tensor class-attribute instance-attribute

state_indices_tensor: Tensor = Tensor()

__init__

__init__(
    minimax_cache: Tensor = Tensor(),
    state_indices_tensor: Tensor = Tensor(),
) -> None

at_layer_idx

at_layer_idx(layer_idx)
Source code in vllm/model_executor/models/minimax_cache.py
def at_layer_idx(self, layer_idx):
    """Return params narrowed to ``layer_idx`` along the cache's first dim.

    The state-indices tensor is passed through unchanged.
    """
    layer_cache = self.minimax_cache[layer_idx, ...]
    return MinimaxCacheParams(
        minimax_cache=layer_cache,
        state_indices_tensor=self.state_indices_tensor,
    )