Skip to content

vllm.model_executor.layers.mamba.ops.mamba_ssm

_canonical_cache_dtype

_canonical_cache_dtype(cache_dtype: str) -> str

Canonical key for config lookup. bf16 and fp16 share the same tuned configs because the kernel only sees bit width when accessing state.

Source code in vllm/model_executor/layers/mamba/ops/mamba_ssm.py
def _canonical_cache_dtype(cache_dtype: str) -> str:
    """Canonical key for config lookup. bf16 and fp16 share the same tuned
    configs because the kernel only sees bit width when accessing state."""
    return "float16" if cache_dtype == "bfloat16" else cache_dtype

_get_default_ssm_launch_config

_get_default_ssm_launch_config(
    dstate: int, is_blackwell: bool
) -> tuple[int, int]

Hard-coded fallback heuristic used when no tuned config is available.

Source code in vllm/model_executor/layers/mamba/ops/mamba_ssm.py
def _get_default_ssm_launch_config(
    dstate: int,
    is_blackwell: bool,
) -> tuple[int, int]:
    """Hard-coded fallback heuristic used when no tuned config is available."""
    BLOCK_SIZE_M, num_warps = 4, 8
    if dstate <= 16:
        BLOCK_SIZE_M, num_warps = 32, 4
    elif dstate <= 32:
        BLOCK_SIZE_M, num_warps = 16, 4
    elif dstate <= 64:
        BLOCK_SIZE_M, num_warps = 8, 4
    else:
        if is_blackwell:
            BLOCK_SIZE_M, num_warps = 32, 8
        elif dstate <= 128:
            BLOCK_SIZE_M, num_warps = 4, 4
    return BLOCK_SIZE_M, num_warps

_try_get_optimal_ssm_config_cached cached

_try_get_optimal_ssm_config_cached(
    headdim: int,
    dstate: int,
    batch: int,
    nheads: int,
    cache_dtype: str,
    is_blackwell: bool,
) -> tuple[int, int]

Cached resolution. See `try_get_optimal_ssm_config`.

Source code in vllm/model_executor/layers/mamba/ops/mamba_ssm.py
@functools.cache
def _try_get_optimal_ssm_config_cached(
    headdim: int,
    dstate: int,
    batch: int,
    nheads: int,
    cache_dtype: str,
    is_blackwell: bool,
) -> tuple[int, int]:
    """Memoized lookup of ``(BLOCK_SIZE_M, num_warps)`` for a kernel shape.

    Tuned configs from ``get_ssm_configs`` are keyed by effective batch
    (``batch * nheads``); the entry whose key is nearest to the requested
    effective batch is used. When no tuned configs are available, the
    hard-coded heuristic from ``_get_default_ssm_launch_config`` is returned.
    """
    target = batch * nheads
    tuned = get_ssm_configs(headdim, dstate, cache_dtype)
    if not tuned:
        return _get_default_ssm_launch_config(dstate, is_blackwell)

    # Pick the closest effective batch in the tuned grid (MoE strategy).
    nearest = min(tuned, key=lambda tuned_batch: abs(tuned_batch - target))
    entry = tuned[nearest]
    return entry["BLOCK_SIZE_M"], entry["num_warps"]

get_ssm_config_file_name

get_ssm_config_file_name(
    headdim: int,
    dstate: int,
    cache_dtype: str,
    device_name: str,
) -> str

Return the JSON filename for the given kernel shape.

Layout: `configs/selective_state_update/headdim=<H>,dstate=<D>,device_name=<dev>,cache_dtype=<dt>.json`.

Source code in vllm/model_executor/layers/mamba/ops/mamba_ssm.py
def get_ssm_config_file_name(
    headdim: int, dstate: int, cache_dtype: str, device_name: str
) -> str:
    """Build the JSON file name that identifies a tuned kernel shape.

    Layout: ``configs/selective_state_update/
    headdim=<H>,dstate=<D>,device_name=<dev>,cache_dtype=<dt>.json``.
    Only the file name (no directory component) is returned.
    """
    fields = (
        f"headdim={headdim}",
        f"dstate={dstate}",
        f"device_name={device_name}",
        f"cache_dtype={cache_dtype}",
    )
    return ",".join(fields) + ".json"

get_ssm_configs cached

get_ssm_configs(
    headdim: int, dstate: int, cache_dtype: str
) -> dict[int, Any] | None

Return tuned (BLOCK_SIZE_M, num_warps) configs for selective_state_update keyed by effective_batch = batch * nheads, or None if no config file is found for the (headdim, dstate, cache_dtype, device) combination.

They can be generated with

benchmarks/kernels/benchmark_selective_state_update.py --save-configs

Source code in vllm/model_executor/layers/mamba/ops/mamba_ssm.py
@functools.cache
def get_ssm_configs(
    headdim: int, dstate: int, cache_dtype: str
) -> dict[int, Any] | None:
    """
    Load tuned (BLOCK_SIZE_M, num_warps) configs for *selective_state_update*.

    The returned mapping is keyed by ``effective_batch = batch * nheads``.
    ``None`` is returned when no usable config file exists for the
    (headdim, dstate, cache_dtype, device) combination. The folder named by
    ``envs.VLLM_TUNED_CONFIG_FOLDER`` (when set) is searched before the
    bundled ``_CONFIGS_DIR``. Results are memoized per argument tuple.

    Config files can be generated with:
        benchmarks/kernels/benchmark_selective_state_update.py --save-configs
    """
    json_file_name = get_ssm_config_file_name(
        headdim,
        dstate,
        _canonical_cache_dtype(cache_dtype),
        get_ssm_device_name(),
    )

    # Candidate locations in priority order: user-supplied override first,
    # bundled default last.
    candidates: list[str] = []
    override_dir = envs.VLLM_TUNED_CONFIG_FOLDER
    if override_dir is not None:
        candidates.append(os.path.join(override_dir, json_file_name))
    candidates.append(os.path.join(_CONFIGS_DIR, json_file_name))

    for candidate in candidates:
        if not os.path.exists(candidate):
            continue
        with open(candidate) as fh:
            logger.info_once(
                "Using SSM config from %s for selective_state_update.",
                candidate,
                scope="global",
            )
            loaded = json.load(fh)
        if not isinstance(loaded, dict):
            # Not a usable mapping; try the next candidate location.
            continue
        # triton_version is included in the config file only for reference.
        loaded.pop("triton_version", None)
        return {int(key): value for key, value in loaded.items() if key.isdigit()}

    logger.warning_once(
        "Using default Mamba SSU config. Performance might be sub-optimal! "
        "Config file not found at %s",
        ", ".join(candidates),
    )
    return None

override_ssm_config

override_ssm_config(config: tuple[int, int])

Pin try_get_optimal_ssm_config to config for the duration of the context. Used by the tuning benchmark to time specific configs.

Source code in vllm/model_executor/layers/mamba/ops/mamba_ssm.py
@contextmanager
def override_ssm_config(config: tuple[int, int]):
    """Temporarily pin ``try_get_optimal_ssm_config`` to ``config``.

    While the context is active the module-level override holds ``config``;
    the previous override value is restored on exit, including when the body
    raises. Used by the tuning benchmark to time specific configs.
    """
    global _ssm_config_override
    saved, _ssm_config_override = _ssm_config_override, config
    try:
        yield
    finally:
        _ssm_config_override = saved

selective_scan_fn

selective_scan_fn(
    u,
    ssm_states,
    delta,
    A,
    B,
    C,
    D=None,
    z=None,
    delta_bias=None,
    delta_softplus=False,
    query_start_loc=None,
    cache_indices=None,
    has_initial_state=None,
    null_block_id=NULL_BLOCK_ID,
    block_size=1024,
    block_idx_first_scheduled_token=None,
    block_idx_last_scheduled_token=None,
    initial_state_idx=None,
    cu_chunk_seqlen=None,
    last_chunk_indices=None,
) -> Tensor
Parameters:

- u: (dim, total_length) for varlen or (batch, dim, seqlen); applies changes in place.
- ssm_states: (batch, dim, dstate) or (batch, nheads, dim, dstate); applies changes in place.
- delta: (dim, total_length) for varlen or (batch, dim, seqlen).
- A: (dim, dstate).
- B: (ngroups, dstate, total_length) for varlen or (batch, ngroups, dstate, seqlen).
- C: (ngroups, dstate, total_length) for varlen or (batch, ngroups, dstate, seqlen).
- D: (dim,).
- z: (dim, total_length) for varlen or (batch, dim, seqlen).
- delta_bias: (dim,).
- query_start_loc: (batch + 1) int32. The cumulative sequence lengths of the sequences in the batch, used to index into the sequence; prepended with 0. For example: query_start_loc = torch.Tensor([0,10,16,17]), x.shape=(dim,17).
- cache_indices: (batch) int32. A tensor in which each cell holds the corresponding input and output ssm_state index. Without APC: (batch,) - a single state index per batch item. With APC: (batch, max_positions) - cache block indices for read/write. Each non-zero value indicates a cache block to load from and/or write to.
- has_initial_state: (batch) bool. A tensor populated with ones and zeros, indicating whether the ssm_state at the corresponding index should be used as the initial state. Not providing this argument assumes there is no initial state.
- null_block_id: int. If cache_indices is passed, lets the kernel identify padding entries that will not be processed. For example, with cache_indices = [null_block_id, 1, 20, null_block_id] the kernel will not process entries at indices 0 and 3.
- block_size: int. The block size to align the cached states to.
- block_idx_first_scheduled_token: (batch,), dtype int32. The pointer into cache_indices where the first cache block to be filled is located.
- block_idx_last_scheduled_token: (batch,), dtype int32. The pointer into cache_indices where the last cache block to be filled is located.
- initial_state_idx: (batch,), dtype int32. The pointer into cache_indices where the cache block containing the initial state is located.

Returns:

- output: (dim, total_length) for varlen or (batch, dim, seqlen); supports in-place replacement.

Source code in vllm/model_executor/layers/mamba/ops/mamba_ssm.py
def selective_scan_fn(
    u,
    ssm_states,
    delta,
    A,
    B,
    C,
    D=None,
    z=None,
    delta_bias=None,
    delta_softplus=False,
    query_start_loc=None,
    cache_indices=None,
    has_initial_state=None,
    null_block_id=NULL_BLOCK_ID,
    block_size=1024,
    block_idx_first_scheduled_token=None,
    block_idx_last_scheduled_token=None,
    initial_state_idx=None,
    cu_chunk_seqlen=None,
    last_chunk_indices=None,
) -> torch.Tensor:
    """
    Run ``ops.selective_scan_fwd`` and return the tensor holding its output.

    The varlen layout is selected by passing ``query_start_loc``; otherwise
    the batched layout is assumed.

    u: (dim, total_length) for varlen or (batch, dim, seqlen)
        applies changes in place.
    ssm_states: (batch, dim, dstate) or (batch, nheads, dim, dstate)
        applies changes in place.
    delta: (dim, total_length) for varlen or (batch, dim, seqlen)
    A: (dim, dstate)
    B: (ngroups, dstate, total_length) for varlen or
       (batch, ngroups, dstate, seqlen); an input lacking the ngroups
       axis gets one inserted.
    C: same layout and handling as B.
    D: (dim,), optional
    z: (dim, total_length) for varlen or (batch, dim, seqlen), optional
    delta_bias: (dim,), optional
    delta_softplus: bool
        forwarded unchanged to the op.
    query_start_loc: (batch + 1) int32
        The cumulative sequence lengths of the sequences in
        the batch, used to index into sequence. prepended with 0.
        for example: query_start_loc = torch.Tensor([0,10,16,17]),
        x.shape=(dim,17)
    cache_indices: (batch) int32
        A tensor with each cell is a correspondent
        input and output ssm_state indices
      - Without APC: (batch,) - single state index per batch item
      - With APC: (batch, max_positions) - cache block indices for read/write
        Each non-zero value indicates a cache block to load from and/or write to.
    has_initial_state: (batch) bool
        A tensor populated with ones and zeros,
        indicate if the ssm_state at the corresponding index should be
        used as initial state. Not providing argument assumes
        there's no initial state
    null_block_id: int
        if cache_indices is passed, lets the kernel identify padding entries
        that will not be processed,
        for example: cache_indices = [null_block_id, 1 ,20 ,null_block_id]
        in this case, the kernel will not process entries at indices 0 and 3
    block_size: int
        The block size to align the cached states to
    block_idx_first_scheduled_token: (batch,), dtype int32
        The pointer into cache_indices, where the first
        cache block to be filled is located.
    block_idx_last_scheduled_token: (batch,), dtype int32
        The pointer into cache_indices, where the last cache block
        to be filled is located.
    initial_state_idx: (batch,), dtype int32
        The pointer into cache_indices, where the cache block
        containing the initial state is located.
    cu_chunk_seqlen, last_chunk_indices:
        forwarded unchanged to the op.
        NOTE(review): shapes/semantics are not visible here - confirm
        against the op's definition.
    returns
        output: (dim, total_length) for varlen or (batch, dim, seqlen)
                supports inplace replacement. This is ``z`` when ``z`` is
                given, otherwise ``delta``.
    """

    def _unit_stride_last(tensor):
        # Copy only when the innermost dimension is not already unit-stride.
        return tensor if tensor.stride(-1) == 1 else tensor.contiguous()

    u = _unit_stride_last(u)
    delta = _unit_stride_last(delta)
    if D is not None:
        D = D.contiguous()
    B = _unit_stride_last(B)
    C = _unit_stride_last(C)
    if z is not None:
        z = _unit_stride_last(z)

    if query_start_loc is None:
        # Batched layout: (batch, dstate, seqlen) -> (batch, 1, dstate, seqlen).
        if B.dim() == 3:
            B = B.unsqueeze(1)
        if C.dim() == 3:
            C = C.unsqueeze(1)
    else:
        # Varlen layout: (dstate, total_length) -> (1, dstate, total_length).
        if B.dim() == 2:
            B = B.unsqueeze(0)
        if C.dim() == 2:
            C = C.unsqueeze(0)

    ops.selective_scan_fwd(
        u,
        delta,
        A,
        B,
        C,
        D,
        z,
        delta_bias,
        delta_softplus,
        query_start_loc,
        cache_indices,
        has_initial_state,
        ssm_states,
        null_block_id,
        block_size,
        block_idx_first_scheduled_token,
        block_idx_last_scheduled_token,
        initial_state_idx,
        cu_chunk_seqlen,
        last_chunk_indices,
    )

    # The output is written in place: into z when it is given, else into delta.
    return delta if z is None else z

selective_state_update

selective_state_update(
    state,
    x,
    dt,
    A,
    B,
    C,
    D,
    dt_bias,
    z=None,
    dt_softplus=False,
    state_batch_indices=None,
    dst_state_batch_indices=None,
    null_block_id=NULL_BLOCK_ID,
    out=None,
    num_accepted_tokens=None,
    cu_seqlens=None,
    is_blackwell=False,
    enable_stochastic_rounding=False,
    cache_philox_rounds=0,
)
Arguments:

- state: (batch, dim, dstate) or (batch, nheads, dim, dstate).
- x: (batch, dim) or (batch, nheads, dim).
- dt: (batch, dim) or (batch, nheads, dim).
- A: (dim, dstate) or (nheads, dim, dstate).
- B: (batch, dstate) or (batch, ngroups, dstate).
- C: (batch, dstate) or (batch, ngroups, dstate).
- D: (dim,) or (nheads, dim).
- z: (batch, dim) or (batch, nheads, dim).
- dt_bias: (dim,) or (nheads, dim).
- null_block_id: int. If state_batch_indices is passed, lets the kernel identify padded entries that will not be processed. For example, with state_batch_indices = [null_block_id, 1, 20, null_block_id] the kernel will not process entries at indices 0 and 3.
- out: preallocated ssm output tensor. Assumed to have the same shape as x. Updated in place.
- num_accepted_tokens: (batch,). Number of accepted tokens from the previous verification step; tells the kernel which initial state to use.
- cu_seqlens: (batch,). Length per sequence, for variable length in speculative decoding cases.

Source code in vllm/model_executor/layers/mamba/ops/mamba_ssm.py
def selective_state_update(
    state,
    x,
    dt,
    A,
    B,
    C,
    D,
    dt_bias,
    z=None,
    dt_softplus=False,
    state_batch_indices=None,
    dst_state_batch_indices=None,
    null_block_id=NULL_BLOCK_ID,
    out=None,
    num_accepted_tokens=None,
    cu_seqlens=None,
    is_blackwell=False,
    enable_stochastic_rounding=False,
    cache_philox_rounds=0,
):
    """
    Launch ``_selective_scan_update_kernel``, writing the result into ``out``.

    Inputs given without a heads/groups axis get one inserted before shapes
    are checked. The launch config comes from ``try_get_optimal_ssm_config``
    rather than autotuning, because autotuning would overwrite the state.
    Nothing is returned.

    Argument:
        state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
        x: (batch, dim) or (batch, nheads, dim)
        dt: (batch, dim) or (batch, nheads, dim)
        A: (dim, dstate) or (nheads, dim, dstate)
        B: (batch, dstate) or (batch, ngroups, dstate)
        C: (batch, dstate) or (batch, ngroups, dstate)
        D: (dim,) or (nheads, dim). Required.
        dt_bias: (dim,) or (nheads, dim). Required.
        z: (batch, dim) or (batch, nheads, dim), optional
        dt_softplus: bool, forwarded unchanged to the kernel.
        state_batch_indices: optional index tensor; a 1-D tensor is expanded
            to 2-D with a trailing axis of size 1. Must have at least N rows
            and at least max_seqlen columns.
        dst_state_batch_indices: optional, same handling as
            state_batch_indices. When omitted, state_batch_indices is
            reused, i.e. the default in-place state update.
        null_block_id: int
            if state_batch_indices is passed, lets the kernel identify
            padded entries that will not be processed,
            for example: state_batch_indices = [null_block_id, 1, 20,
            null_block_id] in this case, the kernel will not process
            entries at indices 0 and 3
        out: Preallocated ssm output tensor. Assume same shape as x.
             In-place updated. Must be provided even though the signature
             defaults it to None.
        num_accepted_tokens: (N,)
            number of accepted tokens from previous verification step,
            tells the kernel which initial state to use. Requires
            state_batch_indices.
        cu_seqlens:
            for variable length in speculative decoding cases. When given,
            the number of sequences is N = len(cu_seqlens) - 1; otherwise
            N = batch.
            NOTE(review): this looks like cumulative offsets of length
            N + 1 rather than per-sequence lengths - confirm against the
            kernel.
        is_blackwell: bool, forwarded to try_get_optimal_ssm_config.
        enable_stochastic_rounding: bool. When true, a one-element random
            seed tensor is drawn on state's device and passed to the kernel,
            which is launched with USE_RS_ROUNDING=True.
        cache_philox_rounds: passed to the kernel as PHILOX_ROUNDS.

    Raises:
        ValueError: if ``out`` is None.
        AssertionError: if any shape check fails.
    """
    # `out` defaults to None only for signature compatibility; it is
    # dereferenced unconditionally below, so fail early with a clear message.
    if out is None:
        raise ValueError(
            "selective_state_update requires a preallocated `out` tensor "
            "with the same shape as `x`."
        )

    if state.dim() == 3:
        state = state.unsqueeze(1)
    if x.dim() == 2:
        x = x.unsqueeze(1)
    if dt.dim() == 2:
        dt = dt.unsqueeze(1)
    if A.dim() == 2:
        A = A.unsqueeze(0)
    if B.dim() == 2:
        B = B.unsqueeze(1)
    if C.dim() == 2:
        C = C.unsqueeze(1)
    if D.dim() == 1:
        D = D.unsqueeze(0)
    if z is not None and z.dim() == 2:
        z = z.unsqueeze(1)
    if dt_bias.dim() == 1:
        dt_bias = dt_bias.unsqueeze(0)
    if out.dim() == 2:
        out = out.unsqueeze(1)
    if state_batch_indices is not None and state_batch_indices.dim() == 1:
        state_batch_indices = state_batch_indices.unsqueeze(1)
    if dst_state_batch_indices is not None and dst_state_batch_indices.dim() == 1:
        dst_state_batch_indices = dst_state_batch_indices.unsqueeze(1)
    if num_accepted_tokens is not None:
        assert state_batch_indices is not None and state_batch_indices.dim() == 2
        assert dst_state_batch_indices is None or dst_state_batch_indices.dim() == 2

    _, nheads, dim, dstate = state.shape
    batch = x.shape[0]
    if cu_seqlens is not None:
        N = len(cu_seqlens) - 1
        # Only used to verify the shape of
        # state_batch_indices and dst_state_batch_indices
        max_seqlen = (
            state_batch_indices.size(-1) if state_batch_indices is not None else 1
        )
    else:
        N = batch
        max_seqlen = 1

    assert x.shape == (batch, nheads, dim)
    assert dt.shape == x.shape
    assert A.shape == (nheads, dim, dstate)
    ngroups = B.shape[1]
    assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
    assert B.shape == (batch, ngroups, dstate)
    assert C.shape == B.shape
    assert D.shape == (nheads, dim)
    if z is not None:
        assert z.shape == x.shape
    assert dt_bias.shape == (nheads, dim)
    if state_batch_indices is not None:
        assert state_batch_indices.shape[0] >= N
        assert state_batch_indices.shape[1] >= max_seqlen
    if dst_state_batch_indices is not None:
        assert dst_state_batch_indices.shape[0] >= N
        assert dst_state_batch_indices.shape[1] >= max_seqlen
    else:
        # revert to the default behavior of in-place state updates
        dst_state_batch_indices = state_batch_indices
    assert out.shape == x.shape
    if num_accepted_tokens is not None:
        assert num_accepted_tokens.shape == (N,)

    # Launch grid: (blocks along dim, sequences, heads).
    grid = lambda META: (triton.cdiv(dim, META["BLOCK_SIZE_M"]), N, nheads)
    # Absent optional tensors are passed with all-zero strides.
    z_strides = (z.stride(0), z.stride(1), z.stride(2)) if z is not None else (0, 0, 0)
    state_batch_indices_strides = (
        (state_batch_indices.stride(0), state_batch_indices.stride(1))
        if state_batch_indices is not None
        else (0, 0)
    )
    dst_state_batch_indices_strides = (
        (dst_state_batch_indices.stride(0), dst_state_batch_indices.stride(1))
        if dst_state_batch_indices is not None
        else (0, 0)
    )
    # We don't want autotune since it will overwrite the state.
    # Load from JSON config if available, otherwise fall back to heuristic.
    cache_dtype = str(state.dtype).removeprefix("torch.")
    BLOCK_SIZE_M, num_warps = try_get_optimal_ssm_config(
        dim, dstate, N, nheads, cache_dtype, is_blackwell
    )

    # True when A, dt and dt_bias all have zero stride along the head
    # dimension (A along dstate as well).
    tie_hdim = (
        A.stride(-1) == 0
        and A.stride(-2) == 0
        and dt.stride(-1) == 0
        and dt_bias.stride(-1) == 0
    )
    rand_seed = (
        torch.randint(0, 2**32, (1,), device=state.device)
        if enable_stochastic_rounding
        else None
    )

    with torch.accelerator.device_index(x.device.index):
        _selective_scan_update_kernel[grid](
            state,
            rand_seed,
            x,
            dt,
            dt_bias,
            A,
            B,
            C,
            D,
            z,
            out,
            state_batch_indices,
            dst_state_batch_indices,
            null_block_id,
            num_accepted_tokens,
            cu_seqlens,
            N,
            nheads,
            dim,
            dstate,
            nheads // ngroups,
            state.stride(0),
            state.stride(1),
            state.stride(2),
            state.stride(3),
            x.stride(0),
            x.stride(1),
            x.stride(2),
            dt.stride(0),
            dt.stride(1),
            dt.stride(2),
            dt_bias.stride(0),
            dt_bias.stride(1),
            A.stride(0),
            A.stride(1),
            A.stride(2),
            B.stride(0),
            B.stride(1),
            B.stride(2),
            C.stride(0),
            C.stride(1),
            C.stride(2),
            D.stride(0),
            D.stride(1),
            z_strides[0],
            z_strides[1],
            z_strides[2],
            out.stride(0),
            out.stride(1),
            out.stride(2),
            state_batch_indices_strides[0],
            state_batch_indices_strides[1],
            dst_state_batch_indices_strides[0],
            dst_state_batch_indices_strides[1],
            dt_softplus,
            tie_hdim,
            BLOCK_SIZE_M,
            num_warps=num_warps,
            USE_RS_ROUNDING=enable_stochastic_rounding,
            PHILOX_ROUNDS=cache_philox_rounds,
        )

try_get_optimal_ssm_config

try_get_optimal_ssm_config(
    headdim: int,
    dstate: int,
    batch: int,
    nheads: int,
    cache_dtype: str,
    is_blackwell: bool,
) -> tuple[int, int]

Return (BLOCK_SIZE_M, num_warps) for the given kernel shape.

Tuning is keyed on effective_batch = batch * nheads (the kernel grid scales with the product), so configs transfer across (model, TP) combos sharing (headdim, dstate, cache_dtype).

Source code in vllm/model_executor/layers/mamba/ops/mamba_ssm.py
def try_get_optimal_ssm_config(
    headdim: int,
    dstate: int,
    batch: int,
    nheads: int,
    cache_dtype: str,
    is_blackwell: bool,
) -> tuple[int, int]:
    """Resolve the ``(BLOCK_SIZE_M, num_warps)`` pair for a kernel shape.

    An active module-level override (``_ssm_config_override``) takes
    precedence and is returned as-is. Otherwise the lookup is delegated to
    the memoized ``_try_get_optimal_ssm_config_cached``.

    Tuning is keyed on ``effective_batch = batch * nheads`` (the kernel grid
    scales with the product), so configs transfer across (model, TP) combos
    sharing ``(headdim, dstate, cache_dtype)``.
    """
    pinned = _ssm_config_override
    if pinned is None:
        return _try_get_optimal_ssm_config_cached(
            headdim, dstate, batch, nheads, cache_dtype, is_blackwell
        )
    return pinned