
vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe

logger module-attribute

logger = init_logger(__name__)

BatchedDeepGemmExperts

Bases: FusedMoEPermuteExpertsUnpermute

Source code in vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):

    # The Deep Gemm kernels only support block size of 128
    DEEPGEMM_BLOCK_SHAPE: list[int] = [128, 128]

    def __init__(self,
                 max_num_tokens: int,
                 num_dispatchers: int,
                 block_shape: list[int],
                 per_act_token_quant=False):
        """
        max_num_tokens: Maximum number of tokens from a DP Rank
        num_dispatchers: The number of DP dispatchers.
        block_shape: Block quantization block shape.
        per_act_token_quant: Per activation token quantization flag.
        """
        super().__init__(
            FusedMoEQuantConfig(
                quant_dtype=torch.float8_e4m3fn,
                per_act_token_quant=per_act_token_quant,
                block_shape=block_shape,
            ))
        assert self.block_shape == self.DEEPGEMM_BLOCK_SHAPE
        self.max_num_tokens = max_num_tokens
        self.num_dispatchers = num_dispatchers

    @property
    def activation_formats(
        self
    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
        return (mk.FusedMoEActivationFormat.BatchedExperts,
                mk.FusedMoEActivationFormat.BatchedExperts)

    def supports_chunking(self) -> bool:
        return False

    def supports_expert_map(self) -> bool:
        return False

    def workspace_shapes(
        self,
        a: torch.Tensor,
        aq: torch.Tensor,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
        assert a.dim() == 2
        # FIXME (varun): We should be able to dispatch only from the leader
        # DP ranks in the case of TP > 1. At the moment, all the Ranks
        # end up sending their tokens. This needs to be fixed.
        num_dispatchers = self.num_dispatchers
        num_experts = local_num_experts
        max_num_tokens = a.size(
            0) if self.max_num_tokens is None else self.max_num_tokens
        workspace13 = (num_experts, max_num_tokens * num_dispatchers,
                       max(K, N))
        workspace2 = (num_experts, max_num_tokens * num_dispatchers, (N // 2))
        output = (num_experts, max_num_tokens * num_dispatchers, K)
        return (workspace13, workspace2, output, a.dtype)

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: str,
        global_num_experts: int,
        expert_map: Optional[torch.Tensor],
        w1_scale: Optional[torch.Tensor],
        w2_scale: Optional[torch.Tensor],
        w1_zp: Optional[torch.Tensor],
        w2_zp: Optional[torch.Tensor],
        a1q_scale: Optional[torch.Tensor],
        a2_scale: Optional[torch.Tensor],
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_num_tokens: Optional[torch.Tensor],
    ):
        import deep_gemm as dg
        assert hidden_states.ndim == 3
        assert self.block_shape is not None

        a1q = hidden_states
        _, N, K = w1.size()

        assert w2.size(1) == K

        E, max_num_tokens, N, K, top_k_num = mk._moe_problem_size(
            hidden_states, w1, w2, topk_ids)

        workspace1 = _resize_cache(workspace13, (E, max_num_tokens, N))

        # (from deepgemm docs) : A value hint (which is a value on CPU)
        # for the M expectation of each batch, correctly setting this value
        # may lead to better performance.
        expected_m = max_num_tokens

        dg.m_grouped_gemm_fp8_fp8_bf16_nt_masked((a1q, a1q_scale),
                                                 (w1, w1_scale),
                                                 out=workspace1,
                                                 masked_m=expert_num_tokens,
                                                 expected_m=expected_m)

        assert expert_num_tokens is not None
        a2q, a2q_scale = silu_mul_fp8_quant_deep_gemm(workspace1,
                                                      expert_num_tokens)

        dg.m_grouped_gemm_fp8_fp8_bf16_nt_masked((a2q, a2q_scale),
                                                 (w2, w2_scale),
                                                 out=output,
                                                 masked_m=expert_num_tokens,
                                                 expected_m=expected_m)

DEEPGEMM_BLOCK_SHAPE class-attribute instance-attribute

DEEPGEMM_BLOCK_SHAPE: list[int] = [128, 128]

activation_formats property

activation_formats: tuple[FusedMoEActivationFormat, FusedMoEActivationFormat]

max_num_tokens instance-attribute

max_num_tokens = max_num_tokens

num_dispatchers instance-attribute

num_dispatchers = num_dispatchers

__init__

__init__(
    max_num_tokens: int,
    num_dispatchers: int,
    block_shape: list[int],
    per_act_token_quant=False,
)

max_num_tokens: Maximum number of tokens from a DP Rank.
num_dispatchers: The number of DP dispatchers.
block_shape: Block quantization block shape.
per_act_token_quant: Per activation token quantization flag.

Source code in vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
def __init__(self,
             max_num_tokens: int,
             num_dispatchers: int,
             block_shape: list[int],
             per_act_token_quant=False):
    """
    max_num_tokens: Maximum number of tokens from a DP Rank
    num_dispatchers: The number of DP dispatchers.
    block_shape: Block quantization block shape.
    per_act_token_quant: Per activation token quantization flag.
    """
    super().__init__(
        FusedMoEQuantConfig(
            quant_dtype=torch.float8_e4m3fn,
            per_act_token_quant=per_act_token_quant,
            block_shape=block_shape,
        ))
    assert self.block_shape == self.DEEPGEMM_BLOCK_SHAPE
    self.max_num_tokens = max_num_tokens
    self.num_dispatchers = num_dispatchers
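
A minimal construction sketch (values are illustrative; the block shape must equal DEEPGEMM_BLOCK_SHAPE or the constructor's assertion fails):

from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    BatchedDeepGemmExperts)

experts = BatchedDeepGemmExperts(
    max_num_tokens=64,       # per-DP-rank token budget (illustrative)
    num_dispatchers=2,       # number of DP dispatchers (illustrative)
    block_shape=[128, 128],  # Deep Gemm kernels only support 128x128 blocks
    per_act_token_quant=False,
)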

apply

apply(
    output: Tensor,
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_ids: Tensor,
    activation: str,
    global_num_experts: int,
    expert_map: Optional[Tensor],
    w1_scale: Optional[Tensor],
    w2_scale: Optional[Tensor],
    w1_zp: Optional[Tensor],
    w2_zp: Optional[Tensor],
    a1q_scale: Optional[Tensor],
    a2_scale: Optional[Tensor],
    workspace13: Tensor,
    workspace2: Tensor,
    expert_num_tokens: Optional[Tensor],
)
Source code in vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
def apply(
    self,
    output: torch.Tensor,
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: str,
    global_num_experts: int,
    expert_map: Optional[torch.Tensor],
    w1_scale: Optional[torch.Tensor],
    w2_scale: Optional[torch.Tensor],
    w1_zp: Optional[torch.Tensor],
    w2_zp: Optional[torch.Tensor],
    a1q_scale: Optional[torch.Tensor],
    a2_scale: Optional[torch.Tensor],
    workspace13: torch.Tensor,
    workspace2: torch.Tensor,
    expert_num_tokens: Optional[torch.Tensor],
):
    import deep_gemm as dg
    assert hidden_states.ndim == 3
    assert self.block_shape is not None

    a1q = hidden_states
    _, N, K = w1.size()

    assert w2.size(1) == K

    E, max_num_tokens, N, K, top_k_num = mk._moe_problem_size(
        hidden_states, w1, w2, topk_ids)

    workspace1 = _resize_cache(workspace13, (E, max_num_tokens, N))

    # (from deepgemm docs) : A value hint (which is a value on CPU)
    # for the M expectation of each batch, correctly setting this value
    # may lead to better performance.
    expected_m = max_num_tokens

    dg.m_grouped_gemm_fp8_fp8_bf16_nt_masked((a1q, a1q_scale),
                                             (w1, w1_scale),
                                             out=workspace1,
                                             masked_m=expert_num_tokens,
                                             expected_m=expected_m)

    assert expert_num_tokens is not None
    a2q, a2q_scale = silu_mul_fp8_quant_deep_gemm(workspace1,
                                                  expert_num_tokens)

    dg.m_grouped_gemm_fp8_fp8_bf16_nt_masked((a2q, a2q_scale),
                                             (w2, w2_scale),
                                             out=output,
                                             masked_m=expert_num_tokens,
                                             expected_m=expected_m)
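
For orientation, the shapes flowing through apply look roughly as follows (a sketch; the batched, masked layout is produced by the upstream prepare/finalize stage, and M denotes the padded per-expert token dimension):

# E = number of local experts, K = hidden size, N = w1 output size,
# M = padded tokens per expert (max_num_tokens * num_dispatchers)
#
# a1q (hidden_states) : (E, M, K)       fp8, valid rows given by expert_num_tokens
# w1                  : (E, N, K)       fp8, block-quantized
#   -> masked grouped GEMM 1 -> workspace1 : (E, M, N)  bf16
# silu_mul_fp8_quant_deep_gemm(workspace1, expert_num_tokens)
#   -> a2q : (E, M, N // 2)  fp8, with per-group scales a2q_scale
# w2                  : (E, K, N // 2)  fp8, block-quantized
#   -> masked grouped GEMM 2 -> output : (E, M, K)  bf16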

supports_chunking

supports_chunking() -> bool
Source code in vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
def supports_chunking(self) -> bool:
    return False

supports_expert_map

supports_expert_map() -> bool
Source code in vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
def supports_expert_map(self) -> bool:
    return False

workspace_shapes

workspace_shapes(
    a: Tensor,
    aq: Tensor,
    M: int,
    N: int,
    K: int,
    topk: int,
    global_num_experts: int,
    local_num_experts: int,
) -> tuple[
    tuple[int, ...], tuple[int, ...], tuple[int, ...], dtype
]
Source code in vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
def workspace_shapes(
    self,
    a: torch.Tensor,
    aq: torch.Tensor,
    M: int,
    N: int,
    K: int,
    topk: int,
    global_num_experts: int,
    local_num_experts: int,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
    assert a.dim() == 2
    # FIXME (varun): We should be able to dispatch only from the leader
    # DP ranks in the case of TP > 1. At the moment, all the Ranks
    # end up sending their tokens. This needs to be fixed.
    num_dispatchers = self.num_dispatchers
    num_experts = local_num_experts
    max_num_tokens = a.size(
        0) if self.max_num_tokens is None else self.max_num_tokens
    workspace13 = (num_experts, max_num_tokens * num_dispatchers,
                   max(K, N))
    workspace2 = (num_experts, max_num_tokens * num_dispatchers, (N // 2))
    output = (num_experts, max_num_tokens * num_dispatchers, K)
    return (workspace13, workspace2, output, a.dtype)
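
As a concrete, illustrative example: with local_num_experts=8, max_num_tokens=64, num_dispatchers=2, N=14336 and K=4096, the returned shapes are:

# workspace13 : (8, 64 * 2, max(4096, 14336)) = (8, 128, 14336)
# workspace2  : (8, 64 * 2, 14336 // 2)       = (8, 128, 7168)
# output      : (8, 64 * 2, 4096)             = (8, 128, 4096)
# dtype       : a.dtype (the unquantized activation dtype)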

_silu_mul_fp8_quant_deep_gemm

_silu_mul_fp8_quant_deep_gemm(
    input_ptr,
    y_q_ptr,
    y_s_ptr,
    counts_ptr,
    H: constexpr,
    GROUP_SIZE: constexpr,
    stride_i_e,
    stride_i_t,
    stride_i_h,
    stride_yq_e,
    stride_yq_t,
    stride_yq_h,
    stride_ys_e,
    stride_ys_t,
    stride_ys_g,
    stride_counts_e,
    eps: constexpr,
    fp8_min: constexpr,
    fp8_max: constexpr,
    BLOCK: constexpr,
)
Source code in vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
@triton.jit
def _silu_mul_fp8_quant_deep_gemm(
    # Pointers ------------------------------------------------------------
    input_ptr,  # 16-bit activations (E, T, 2*H)
    y_q_ptr,  # fp8 quantized activations (E, T, H)
    y_s_ptr,  # 16-bit scales (E, T, G)
    counts_ptr,  # int32 num tokens per expert (E)

    # Sizes ---------------------------------------------------------------
    H: tl.constexpr,  # hidden dimension (per output)
    GROUP_SIZE: tl.constexpr,  # elements per group (usually 128)

    # Strides for input (elements) ---------------------------------------
    stride_i_e,
    stride_i_t,
    stride_i_h,

    # Strides for y_q (elements) -----------------------------------------
    stride_yq_e,
    stride_yq_t,
    stride_yq_h,

    # Strides for y_s (elements) -----------------------------------------
    stride_ys_e,
    stride_ys_t,
    stride_ys_g,

    # Stride for counts (elements)
    stride_counts_e,

    # Numeric params ------------------------------------------------------
    eps: tl.constexpr,
    fp8_min: tl.constexpr,
    fp8_max: tl.constexpr,

    # Meta ---------------------------------------------------------------
    BLOCK: tl.constexpr,
):
    G = H // GROUP_SIZE

    # map program id -> (e, g)
    pid = tl.program_id(0)
    e = pid // G
    g = pid % G

    e = e.to(tl.int64)
    g = g.to(tl.int64)

    # number of valid tokens for this expert
    n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64)

    cols = tl.arange(0, BLOCK)
    cols = cols.to(tl.int64)
    mask_h = cols < BLOCK

    t = tl.zeros([], tl.int64)
    while t < n_tokens:
        base_i_offset = (e * stride_i_e + t * stride_i_t +
                         g * GROUP_SIZE * stride_i_h)
        base_yq_offset = (e * stride_yq_e + t * stride_yq_t +
                          g * GROUP_SIZE * stride_yq_h)
        base_ys_offset = e * stride_ys_e + t * stride_ys_t + g * stride_ys_g

        mask = mask_h
        x = tl.load(input_ptr + base_i_offset + cols * stride_i_h,
                    mask=mask,
                    other=0.0).to(tl.float32)
        y2 = tl.load(input_ptr + base_i_offset + H * stride_i_h +
                     cols * stride_i_h,
                     mask=mask,
                     other=0.0).to(tl.float32)

        x = x * (1.0 / (1.0 + tl.exp(-x)))
        y = x * y2

        _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
        y_s = _absmax / fp8_max
        y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)

        tl.store(y_q_ptr + base_yq_offset + cols * stride_yq_h, y_q, mask=mask)
        tl.store(y_s_ptr + base_ys_offset, y_s)

        t += 1
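
The per-group math performed inside the loop can be expressed as a plain-PyTorch reference (a sketch for clarity only; it omits the per-expert token masking that the kernel applies via counts_ptr, and returns a contiguous scale tensor rather than the strided layout the wrapper allocates):

import torch
import torch.nn.functional as F

def silu_mul_fp8_quant_reference(y: torch.Tensor, group_size: int = 128,
                                 eps: float = 1e-10):
    # y: (E, T, 2*H); the first half is silu-activated and multiplied
    # by the second half, then quantized per group of `group_size` elements.
    E, T, H2 = y.shape
    H = H2 // 2
    fp8 = torch.finfo(torch.float8_e4m3fn)
    gate, up = y[..., :H].float(), y[..., H:].float()
    out = F.silu(gate) * up
    groups = out.reshape(E, T, H // group_size, group_size)
    # group-wise absmax scale, clamped below by eps, then clamp into fp8 range
    scale = groups.abs().amax(dim=-1).clamp_min(eps) / fp8.max
    q = (groups / scale.unsqueeze(-1)).clamp(fp8.min, fp8.max)
    return q.reshape(E, T, H).to(torch.float8_e4m3fn), scale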

silu_mul_fp8_quant_deep_gemm

silu_mul_fp8_quant_deep_gemm(
    y: Tensor,
    tokens_per_expert: Tensor,
    group_size: int = 128,
    eps: float = 1e-10,
)

Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales

y has shape (E, T, 2*H). The first half of the last dimension is silu-activated, multiplied by the second half, then quantized into FP8.

Returns (y_q, y_s) where

* y_q is the FP8 tensor of shape (E, T, H), same layout as y[..., :H].
* y_s has shape (E, T, H // group_size) and strides (T*G, 1, T).

Source code in vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
def silu_mul_fp8_quant_deep_gemm(
    y: torch.Tensor,  # (E, T, 2*H) float32
    tokens_per_expert: torch.Tensor,  # (E,) number of valid tokens per expert
    group_size: int = 128,
    eps: float = 1e-10,
):
    """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales

    y has shape (E, T, 2*H). The first half of the last dimension is 
    silu-activated, multiplied by the second half, then quantized into FP8.

    Returns `(y_q, y_s)` where
    * `y_q` is the FP8 tensor of shape `(E, T, H)`, same layout as `y[..., :H]`.
    * `y_s` has shape `(E, T, H // group_size)` and strides `(T*G, 1, T)`
    """
    assert y.ndim == 3, "y must be (E, T, 2*H)"
    E, T, H2 = y.shape
    assert H2 % 2 == 0, "last dim of y must be even (2*H)"
    H = H2 // 2
    G = H // group_size
    assert H % group_size == 0, "H must be divisible by group_size"
    assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, \
        "tokens_per_expert must be shape (E,)"
    tokens_per_expert = tokens_per_expert.to(device=y.device,
                                             dtype=torch.int32)

    # allocate outputs
    fp8_dtype = torch.float8_e4m3fn
    y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)

    # strides (elements)
    stride_i_e, stride_i_t, stride_i_h = y.stride()
    stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()

    # desired scale strides (elements): (T*G, 1, T)
    stride_ys_e = T * G
    stride_ys_t = 1
    stride_ys_g = T
    y_s = torch.empty_strided((E, T, G),
                              (stride_ys_e, stride_ys_t, stride_ys_g),
                              dtype=torch.float32,
                              device=y.device)

    stride_cnt_e = tokens_per_expert.stride()[0]

    # static grid over experts and H-groups.
    # A loop inside the kernel handles the token dim
    grid = (E * G, )

    f_info = torch.finfo(fp8_dtype)
    fp8_max = f_info.max
    fp8_min = f_info.min

    _silu_mul_fp8_quant_deep_gemm[grid](
        y,
        y_q,
        y_s,
        tokens_per_expert,
        H,
        group_size,
        stride_i_e,
        stride_i_t,
        stride_i_h,
        stride_yq_e,
        stride_yq_t,
        stride_yq_h,
        stride_ys_e,
        stride_ys_t,
        stride_ys_g,
        stride_cnt_e,
        eps,
        fp8_min,
        fp8_max,
        BLOCK=group_size,
        num_warps=4,
    )

    return y_q, y_s
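
A minimal usage sketch (shapes are illustrative, and a CUDA device is assumed since the Triton kernel and float8_e4m3fn outputs target the GPU):

import torch
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    silu_mul_fp8_quant_deep_gemm)

E, T, H = 4, 32, 1024   # experts, padded tokens per expert, hidden size
y = torch.randn(E, T, 2 * H, dtype=torch.bfloat16, device="cuda")
tokens_per_expert = torch.tensor([32, 7, 0, 15], device="cuda")

y_q, y_s = silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert)
# y_q: (4, 32, 1024) float8_e4m3fn
# y_s: (4, 32, 8) float32, with strides (T*G, 1, T) = (256, 1, 32)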