vllm.model_executor.models.minimax_cache

MinimaxCacheManager ¶

Bases: ConstantSizeCache

Source code in vllm/model_executor/models/minimax_cache.py

class MinimaxCacheManager(ConstantSizeCache):
    """Constant-size cache backed by one pre-allocated CUDA tensor.

    The tensor is allocated once with shape ``cache_shape``. Dimension 1 of
    that shape is the maximum batch size (one slot per batch entry);
    dimension 0 is presumably the layer axis -- confirm against the caller.
    """

    def __init__(self, dtype, cache_shape):
        """Allocate the cache tensor.

        Args:
            dtype: Element dtype of the cache tensor.
            cache_shape: Full shape of the cache tensor; ``cache_shape[1]``
                is forwarded to the base class as the maximum batch size.
        """
        super().__init__(cache_shape[1])  # max_batch_size is cache_shape[1]
        # torch.empty leaves the contents uninitialized.
        self._minimax_cache = torch.empty(size=cache_shape,
                                          dtype=dtype,
                                          device="cuda")

    @property
    def cache(self):
        """Return the backing cache tensor."""
        return self._minimax_cache

    def _copy_cache(self, from_index: int, to_index: int):
        """Copy the slot at ``from_index`` into the slot at ``to_index``.

        Both indices address dimension 1 of the cache tensor (the batch-slot
        axis); the copy covers every entry along dimension 0.

        Args:
            from_index: Source slot index along dimension 1.
            to_index: Destination slot index along dimension 1.
        """
        cache = self.cache
        assert len(cache) > 0
        # The cache is a single tensor, so iterating over it would yield
        # slices with dimension 0 already removed, and indexing those slices
        # with ``[:, idx]`` would address the original dimension 2 instead of
        # the batch-slot axis. Index the full tensor so ``[:, idx]`` selects
        # dimension 1 for all entries of dimension 0 in a single copy.
        cache[:, to_index].copy_(cache[:, from_index], non_blocking=True)

_minimax_cache `instance-attribute` ¶

_minimax_cache = empty(
    size=cache_shape, dtype=dtype, device="cuda"
)

cache `property` ¶

cache

__init__ ¶

__init__(dtype, cache_shape)

Source code in vllm/model_executor/models/minimax_cache.py

def __init__(self, dtype, cache_shape):
    """Allocate the cache tensor and register the maximum batch size.

    Args:
        dtype: Element dtype of the cache tensor.
        cache_shape: Full shape of the cache tensor; its entry at index 1
            is the maximum batch size handed to the base class.
    """
    # max_batch_size is cache_shape[1]
    max_batch_size = cache_shape[1]
    super().__init__(max_batch_size)

    # Contents are uninitialized until written.
    storage = torch.empty(size=cache_shape, dtype=dtype, device="cuda")
    self._minimax_cache = storage

_copy_cache ¶

_copy_cache(from_index: int, to_index: int)

Source code in vllm/model_executor/models/minimax_cache.py

def _copy_cache(self, from_index: int, to_index: int):
    """Copy the slot at ``from_index`` into the slot at ``to_index``.

    Both indices address dimension 1 of ``self.cache`` (presumably the
    batch-slot axis -- confirm against the base class); the copy covers
    every entry along dimension 0.

    Args:
        from_index: Source slot index along dimension 1.
        to_index: Destination slot index along dimension 1.
    """
    cache = self.cache
    assert len(cache) > 0
    # ``self.cache`` is a single tensor, so iterating over it would yield
    # slices with dimension 0 already removed, and indexing those slices
    # with ``[:, idx]`` would address the original dimension 2. Index the
    # full tensor so ``[:, idx]`` selects dimension 1 in one copy.
    cache[:, to_index].copy_(cache[:, from_index], non_blocking=True)

MinimaxCacheParams `dataclass` ¶

Source code in vllm/model_executor/models/minimax_cache.py

@dataclass
class MinimaxCacheParams:
    """Bundle of a cache tensor and its state-indices tensor.

    Both fields default to an empty ``torch.Tensor()``.
    """

    # Cache tensor; indexed along dimension 0 by ``at_layer_idx``.
    minimax_cache: torch.Tensor = torch.Tensor()
    # Passed through unchanged by ``at_layer_idx``.
    state_indices_tensor: torch.Tensor = torch.Tensor()

    def at_layer_idx(self, layer_idx):
        """Return params whose cache is restricted to ``layer_idx``.

        The cache is indexed along its first dimension; the state-indices
        tensor is the same object as on ``self``.
        """
        layer_cache = self.minimax_cache[layer_idx, ...]
        return MinimaxCacheParams(
            minimax_cache=layer_cache,
            state_indices_tensor=self.state_indices_tensor,
        )

minimax_cache `class-attribute` `instance-attribute` ¶

minimax_cache: Tensor = Tensor()

state_indices_tensor `class-attribute` `instance-attribute` ¶

state_indices_tensor: Tensor = Tensor()

__init__ ¶

__init__(
    minimax_cache: Tensor = Tensor(),
    state_indices_tensor: Tensor = Tensor(),
) -> None

at_layer_idx ¶

at_layer_idx(layer_idx)

Source code in vllm/model_executor/models/minimax_cache.py

def at_layer_idx(self, layer_idx):
    """Return params whose cache is restricted to ``layer_idx``.

    The cache is indexed along its first dimension; the state-indices
    tensor is forwarded unchanged.
    """
    layer_cache = self.minimax_cache[layer_idx, ...]
    indices = self.state_indices_tensor
    return MinimaxCacheParams(layer_cache, indices)

vllm.model_executor.models.minimax_cache

MinimaxCacheManager ¶

_minimax_cache instance-attribute ¶

cache property ¶

__init__ ¶

_copy_cache ¶

MinimaxCacheParams dataclass ¶

minimax_cache class-attribute instance-attribute ¶

state_indices_tensor class-attribute instance-attribute ¶

__init__ ¶

at_layer_idx ¶

_minimax_cache `instance-attribute` ¶

cache `property` ¶

__init__ ¶

MinimaxCacheParams `dataclass` ¶

minimax_cache `class-attribute` `instance-attribute` ¶

state_indices_tensor `class-attribute` `instance-attribute` ¶

__init__ ¶