vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe ¶

all `module-attribute` ¶

__all__ = [
    "CompressedTensorsMoEMethod",
    "CompressedTensorsW8A8Fp8MoEMethod",
    "CompressedTensorsW8A8Int8MoEMethod",
    "CompressedTensorsWNA16MarlinMoEMethod",
    "CompressedTensorsWNA16MoEMethod",
    "CompressedTensorsW4A4MoeMethod",
    "CompressedTensorsW4A8Int8MoEMethod",
]

logger `module-attribute` ¶

logger = init_logger(__name__)

CompressedTensorsMoEMethod ¶

Bases: FusedMoEMethodBase

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

class CompressedTensorsMoEMethod(FusedMoEMethodBase):
    @staticmethod
    def get_moe_method(
        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
        layer: torch.nn.Module,
    ) -> "CompressedTensorsMoEMethod":
        # TODO: @dsikka: refactor this to use schemes as other kernels
        # are supported + check if the layer is being ignored.
        # Check if a using "Linear" to select schemes
        if "Linear" in quant_config.target_scheme_map:
            matched_target = "Linear"
        else:
            # May have instead defined the linear layers in the fused model

            fused_layers = ["re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*"]
            current_scheme = None
            for fused_layer in fused_layers:
                # Check if one of the fused layers are defined in quant_config
                matched_target = find_matched_target(
                    layer_name=fused_layer,
                    module=layer,
                    targets=quant_config.target_scheme_map.keys(),
                    fused_mapping=quant_config.packed_modules_mapping,
                )

                # Only valid if down_proj, gate_proj, and up_proj
                # are mapped to the same quant scheme in the quant_config
                if current_scheme is None:
                    current_scheme = quant_config.target_scheme_map.get(matched_target)
                else:
                    assert current_scheme == quant_config.target_scheme_map.get(
                        matched_target
                    )

        weight_quant = quant_config.target_scheme_map[matched_target].get("weights")
        input_quant = quant_config.target_scheme_map[matched_target].get(
            "input_activations"
        )

        if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
            # group_size=None means channelwise
            group_size = weight_quant.group_size or -1
            # Prefer to use the MarlinMoE kernel when it is supported.
            if (
                not check_moe_marlin_supports_layer(layer, group_size)
                or current_platform.is_rocm()
            ):
                if (
                    weight_quant.strategy == QuantizationStrategy.GROUP
                    and weight_quant.actorder
                    in (ActivationOrdering.GROUP, ActivationOrdering.DYNAMIC)
                ):
                    raise ValueError(
                        "WNA16MoE is not supported with actorder=group/dynamic."
                    )
                logger.info_once("Using CompressedTensorsWNA16MoEMethod")
                return CompressedTensorsWNA16MoEMethod(quant_config, layer.moe_config)
            else:
                logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod")
                return CompressedTensorsWNA16MarlinMoEMethod(
                    quant_config, layer.moe_config
                )
        elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
            return CompressedTensorsW4A4MoeMethod(layer.moe_config)
        elif (
            quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
            or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)
            or quant_config._is_fp8_w8a8(weight_quant, input_quant)
        ):
            return CompressedTensorsW8A8Fp8MoEMethod(quant_config, layer.moe_config)
        elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant):
            return CompressedTensorsW8A8Int8MoEMethod(quant_config, layer.moe_config)
        elif quant_config._is_dynamic_token_w4a8_int(weight_quant, input_quant):
            return CompressedTensorsW4A8Int8MoEMethod(quant_config, layer.moe_config)
        else:
            raise RuntimeError(
                f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
            )

get_moe_method `staticmethod` ¶

get_moe_method(
    quant_config: CompressedTensorsConfig, layer: Module
) -> CompressedTensorsMoEMethod

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

@staticmethod
def get_moe_method(
    quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
    layer: torch.nn.Module,
) -> "CompressedTensorsMoEMethod":
    # TODO: @dsikka: refactor this to use schemes as other kernels
    # are supported + check if the layer is being ignored.
    # Check if a using "Linear" to select schemes
    if "Linear" in quant_config.target_scheme_map:
        matched_target = "Linear"
    else:
        # May have instead defined the linear layers in the fused model

        fused_layers = ["re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*"]
        current_scheme = None
        for fused_layer in fused_layers:
            # Check if one of the fused layers are defined in quant_config
            matched_target = find_matched_target(
                layer_name=fused_layer,
                module=layer,
                targets=quant_config.target_scheme_map.keys(),
                fused_mapping=quant_config.packed_modules_mapping,
            )

            # Only valid if down_proj, gate_proj, and up_proj
            # are mapped to the same quant scheme in the quant_config
            if current_scheme is None:
                current_scheme = quant_config.target_scheme_map.get(matched_target)
            else:
                assert current_scheme == quant_config.target_scheme_map.get(
                    matched_target
                )

    weight_quant = quant_config.target_scheme_map[matched_target].get("weights")
    input_quant = quant_config.target_scheme_map[matched_target].get(
        "input_activations"
    )

    if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
        # group_size=None means channelwise
        group_size = weight_quant.group_size or -1
        # Prefer to use the MarlinMoE kernel when it is supported.
        if (
            not check_moe_marlin_supports_layer(layer, group_size)
            or current_platform.is_rocm()
        ):
            if (
                weight_quant.strategy == QuantizationStrategy.GROUP
                and weight_quant.actorder
                in (ActivationOrdering.GROUP, ActivationOrdering.DYNAMIC)
            ):
                raise ValueError(
                    "WNA16MoE is not supported with actorder=group/dynamic."
                )
            logger.info_once("Using CompressedTensorsWNA16MoEMethod")
            return CompressedTensorsWNA16MoEMethod(quant_config, layer.moe_config)
        else:
            logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod")
            return CompressedTensorsWNA16MarlinMoEMethod(
                quant_config, layer.moe_config
            )
    elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
        return CompressedTensorsW4A4MoeMethod(layer.moe_config)
    elif (
        quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
        or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)
        or quant_config._is_fp8_w8a8(weight_quant, input_quant)
    ):
        return CompressedTensorsW8A8Fp8MoEMethod(quant_config, layer.moe_config)
    elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant):
        return CompressedTensorsW8A8Int8MoEMethod(quant_config, layer.moe_config)
    elif quant_config._is_dynamic_token_w4a8_int(weight_quant, input_quant):
        return CompressedTensorsW4A8Int8MoEMethod(quant_config, layer.moe_config)
    else:
        raise RuntimeError(
            f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
        )

CompressedTensorsW4A4MoeMethod ¶

Bases: CompressedTensorsMoEMethod

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
    def __init__(self, moe: FusedMoEConfig):
        from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import (  # noqa: E501
            detect_nvfp4_moe_support,
        )

        super().__init__(moe)
        _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__)
        self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported
        self.allow_flashinfer = _nvfp4.allow_flashinfer
        self.use_marlin = _nvfp4.use_marlin
        self.group_size = 16
        self.flashinfer_moe_backend = None
        if self.allow_flashinfer:
            self.flashinfer_moe_backend = get_flashinfer_moe_backend()
            logger.info_once(
                f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
                " for CompressedTensorsW4A4MoeMethod."
            )

    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        layer.num_experts = num_experts
        layer.params_dtype = params_dtype

        w13_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                2 * intermediate_size_per_partition,
                # 2 fp4 items are packed in the input dimension
                hidden_size // 2,
                requires_grad=False,
                dtype=torch.uint8,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight_packed", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)

        w2_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                # 2 fp4 items are packed in the input dimension
                intermediate_size_per_partition // 2,
                dtype=torch.uint8,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight_packed", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)

        # Weight Scales
        w13_weight_scale = torch.nn.Parameter(
            torch.empty(
                num_experts,
                2 * intermediate_size_per_partition,
                # 2 fp4 items are packed in the input dimension
                hidden_size // self.group_size,
                dtype=torch.float8_e4m3fn,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight_scale", w13_weight_scale)
        extra_weight_attrs.update(
            {"quant_method": FusedMoeWeightScaleSupported.GROUP.value}
        )
        set_weight_attrs(w13_weight_scale, extra_weight_attrs)

        w2_weight_scale = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                # 2 fp4 items are packed in the input dimension
                intermediate_size_per_partition // self.group_size,
                dtype=torch.float8_e4m3fn,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight_scale", w2_weight_scale)
        extra_weight_attrs.update(
            {"quant_method": FusedMoeWeightScaleSupported.GROUP.value}
        )
        set_weight_attrs(w2_weight_scale, extra_weight_attrs)

        # Weight Global Scales
        w13_weight_scale_2 = torch.nn.Parameter(
            torch.empty(num_experts, 2, dtype=torch.float32), requires_grad=False
        )
        layer.register_parameter("w13_weight_global_scale", w13_weight_scale_2)
        extra_weight_attrs.update(
            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
        )
        set_weight_attrs(w13_weight_scale_2, extra_weight_attrs)

        w2_weight_scale_2 = torch.nn.Parameter(
            torch.empty(num_experts, dtype=torch.float32), requires_grad=False
        )
        layer.register_parameter("w2_weight_global_scale", w2_weight_scale_2)
        extra_weight_attrs.update(
            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
        )
        set_weight_attrs(w2_weight_scale_2, extra_weight_attrs)

        # Input Global Scales
        w13_input_scale = torch.nn.Parameter(
            torch.empty(num_experts, 2, dtype=torch.float32), requires_grad=False
        )
        layer.register_parameter("w13_input_global_scale", w13_input_scale)
        extra_weight_attrs.update(
            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
        )
        set_weight_attrs(w13_input_scale, extra_weight_attrs)

        w2_input_scale = torch.nn.Parameter(
            torch.empty(num_experts, dtype=torch.float32), requires_grad=False
        )
        layer.register_parameter("w2_input_global_scale", w2_input_scale)
        extra_weight_attrs.update(
            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
        )
        set_weight_attrs(w2_input_scale, extra_weight_attrs)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # From packed to weight
        layer.w13_weight = torch.nn.Parameter(
            layer.w13_weight_packed.data, requires_grad=False
        )
        delattr(layer, "w13_weight_packed")

        layer.w2_weight = torch.nn.Parameter(
            layer.w2_weight_packed.data, requires_grad=False
        )
        delattr(layer, "w2_weight_packed")

        # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
        if self.allow_flashinfer:
            w, s = reorder_w1w3_to_w3w1(
                layer.w13_weight.data, layer.w13_weight_scale.data, dim=-2
            )
            layer.w13_weight = torch.nn.Parameter(w, requires_grad=False)
            layer.w13_weight_scale = torch.nn.Parameter(s, requires_grad=False)

        if not torch.allclose(
            layer.w13_weight_global_scale[:, 0], layer.w13_weight_global_scale[:, 1]
        ):
            logger.warning_once(
                "w1_weight_global_scale must match w3_weight_global_scale. "
                "Accuracy may be affected."
            )

        # Take inverse of global scale saved to disk
        layer.w13_weight_scale_2 = torch.nn.Parameter(
            1 / layer.w13_weight_global_scale[:, 0], requires_grad=False
        )

        layer.w2_weight_scale_2 = torch.nn.Parameter(
            1 / layer.w2_weight_global_scale.data, requires_grad=False
        )

        if self.use_marlin:
            prepare_moe_fp4_layer_for_marlin(layer)
            return
        # w13
        if (
            self.allow_flashinfer
            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
        ):
            w13_input_global_scale = (
                layer.w13_input_global_scale.min()
                .to(torch.float32)
                .expand(layer.num_experts)
            )
        else:
            w13_input_global_scale = layer.w13_input_global_scale.min(dim=1).values.to(
                torch.float32
            )
        layer.g1_alphas = torch.nn.Parameter(
            ((1 / w13_input_global_scale) * layer.w13_weight_scale_2),
            requires_grad=False,
        )

        layer.w13_input_scale_quant = torch.nn.Parameter(
            (w13_input_global_scale), requires_grad=False
        )

        # w2
        if (
            self.allow_flashinfer
            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
        ):
            w2_input_global_scale = (
                layer.w2_input_global_scale.min()
                .to(torch.float32)
                .expand(layer.num_experts)
            )
        else:
            w2_input_global_scale = layer.w2_input_global_scale

        layer.g2_alphas = torch.nn.Parameter(
            ((1 / w2_input_global_scale) * layer.w2_weight_scale_2).to(torch.float32),
            requires_grad=False,
        )

        layer.w2_input_scale_quant = torch.nn.Parameter(
            (w2_input_global_scale), requires_grad=False
        )

        # TensorRT-LLM specific processing
        if (
            self.allow_flashinfer
            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
        ):
            # Prepare static weights for TRT-LLM kernel
            # alternate: prepare_static_weight_layouts_for_trtllm_moe
            (
                gemm1_weights_fp4_shuffled,
                gemm1_scales_fp4_shuffled,
                gemm2_weights_fp4_shuffled,
                gemm2_scales_fp4_shuffled,
            ) = prepare_static_weights_for_trtllm_fp4_moe(
                layer.w13_weight,
                layer.w2_weight,
                layer.w13_weight_scale,
                layer.w2_weight_scale,
                layer.w2_weight.size(-2),  # hidden_size
                layer.w13_weight.size(-2) // 2,  # intermediate_size
                layer.w13_weight.size(0),  # num_experts
            )
            logger.debug_once("Finished shuffling weights for TRT-LLM MOE")

            layer.gemm1_weights_fp4_shuffled = Parameter(
                gemm1_weights_fp4_shuffled, requires_grad=False
            )
            layer.gemm2_weights_fp4_shuffled = Parameter(
                gemm2_weights_fp4_shuffled, requires_grad=False
            )
            layer.gemm1_scales_fp4_shuffled = Parameter(
                gemm1_scales_fp4_shuffled, requires_grad=False
            )
            layer.gemm2_scales_fp4_shuffled = Parameter(
                gemm2_scales_fp4_shuffled, requires_grad=False
            )

            # Additional parameter needed for TRT-LLM
            layer.g1_scale_c = Parameter(
                (layer.w2_input_scale_quant * layer.g1_alphas).to(torch.float32),
                requires_grad=False,
            )

            # Clean up weights that won't be used by TRT-LLM
            del layer.w2_weight
            del layer.w2_weight_scale
            del layer.w13_weight
            del layer.w13_weight_scale
        else:
            # swizzle weight scales
            layer.w13_weight_scale = torch.nn.Parameter(
                swizzle_blockscale(layer.w13_weight_scale), requires_grad=False
            )

            layer.w2_weight_scale = torch.nn.Parameter(
                swizzle_blockscale(layer.w2_weight_scale), requires_grad=False
            )

    def maybe_make_prepare_finalize(
        self,
        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    ) -> mk.FusedMoEPrepareAndFinalize | None:
        if self.use_marlin or (
            self.allow_flashinfer
            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
        ):
            return None
        elif not self.allow_flashinfer:
            return super().maybe_make_prepare_finalize(routing_tables)

        prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize(self.moe)
        logger.debug_once("%s", prepare_finalize.__class__.__name__)
        return prepare_finalize

    def select_gemm_impl(
        self,
        prepare_finalize: mk.FusedMoEPrepareAndFinalize,
        layer: torch.nn.Module,
    ) -> mk.FusedMoEPermuteExpertsUnpermute:
        assert self.moe_quant_config is not None
        """Return the appropriate GEMM experts implementation."""
        experts = select_nvfp4_gemm_impl(
            self.moe,
            self.moe_quant_config,
            allow_flashinfer=self.allow_flashinfer,
        )
        logger.debug_once("Using %s", experts.__class__.__name__)
        return experts

    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        if (
            self.use_marlin
            or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
        ):
            return None

        return nvfp4_moe_quant_config(
            g1_alphas=layer.g1_alphas,
            g2_alphas=layer.g2_alphas,
            a1_gscale=layer.w13_input_scale_quant,
            a2_gscale=layer.w2_input_scale_quant,
            w1_scale=layer.w13_weight_scale,
            w2_scale=layer.w2_weight_scale,
        )

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: int | None = None,
        num_expert_group: int | None = None,
        global_num_experts: int = -1,
        expert_map: torch.Tensor | None = None,
        custom_routing_function: Callable | None = None,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        e_score_correction_bias: torch.Tensor | None = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        enable_eplb: bool = False,
        expert_load_view: torch.Tensor | None = None,
        logical_to_physical_map: torch.Tensor | None = None,
        logical_replica_count: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        if enable_eplb:
            raise NotImplementedError(
                "EPLB not supported for `CompressedTensorsW4A4MoeMethod` yet."
            )
        assert activation == "silu", "Only SiLU activation is supported."

        if (
            self.allow_flashinfer
            and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
        ):
            return flashinfer_trtllm_fp4_moe(
                layer=layer,
                x=x,
                router_logits=router_logits,
                top_k=top_k,
                global_num_experts=global_num_experts,
                num_expert_group=num_expert_group,
                topk_group=topk_group,
                custom_routing_function=custom_routing_function,
                e_score_correction_bias=e_score_correction_bias,
            )

        topk_weights, topk_ids, _ = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
            use_grouped_topk=use_grouped_topk,
            top_k=top_k,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            routed_scaling_factor=routed_scaling_factor,
            e_score_correction_bias=e_score_correction_bias,
            indices_type=self.topk_indices_dtype,
        )

        if self.use_marlin:
            return fused_marlin_moe(
                x,
                layer.w13_weight,
                layer.w2_weight,
                None,
                None,
                layer.w13_weight_scale,
                layer.w2_weight_scale,
                router_logits,
                topk_weights,
                topk_ids,
                global_scale1=layer.w13_weight_scale_2,
                global_scale2=layer.w2_weight_scale_2,
                quant_type_id=scalar_types.float4_e2m1f.id,
                apply_router_weight_on_input=apply_router_weight_on_input,
                global_num_experts=global_num_experts,
                expert_map=expert_map,
                workspace=layer.workspace,
            )

        # FlashInfer fused experts path
        elif self.allow_flashinfer:
            from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
                flashinfer_cutlass_moe_fp4,
            )

            assert is_valid_flashinfer_cutlass_fused_moe(
                x, layer.w13_weight, layer.w2_weight
            ), "Flashinfer CUTLASS Fused MoE not applicable!"

            assert self.moe_quant_config is not None

            return flashinfer_cutlass_moe_fp4(
                hidden_states=x,
                w1=layer.w13_weight,
                w2=layer.w2_weight,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                quant_config=self.moe_quant_config,
                inplace=False,  # TODO(shuw): fix later, now output is high prec
                activation=activation,
                global_num_experts=global_num_experts,
                expert_map=expert_map,
                apply_router_weight_on_input=apply_router_weight_on_input,
            )
        else:
            from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4

            assert expert_map is None, (
                "Expert Parallelism / expert_map "
                "is currently not supported for "
                "CompressedTensorsW4A4MoeMethod."
            )
            assert self.moe_quant_config is not None

            # Cutlass moe takes in activations in BF16/Half precision
            # and fp4 quantized weights loaded from the checkpoint
            return cutlass_moe_fp4(
                a=x,
                w1_fp4=layer.w13_weight,
                w2_fp4=layer.w2_weight,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                quant_config=self.moe_quant_config,
                apply_router_weight_on_input=apply_router_weight_on_input,
                # TODO(bnell): derive these from arguments
                m=x.shape[0],
                n=layer.w2_weight.shape[2] * 2,
                k=x.shape[1],
                e=layer.w13_weight.shape[0],
            ).to(x.dtype)

allow_flashinfer `instance-attribute` ¶

allow_flashinfer = allow_flashinfer

cutlass_nvfp4_supported `instance-attribute` ¶

cutlass_nvfp4_supported = cutlass_supported

flashinfer_moe_backend `instance-attribute` ¶

flashinfer_moe_backend = None

group_size `instance-attribute` ¶

group_size = 16

use_marlin `instance-attribute` ¶

use_marlin = use_marlin

init ¶

__init__(moe: FusedMoEConfig)

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def __init__(self, moe: FusedMoEConfig):
    from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import (  # noqa: E501
        detect_nvfp4_moe_support,
    )

    super().__init__(moe)
    _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__)
    self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported
    self.allow_flashinfer = _nvfp4.allow_flashinfer
    self.use_marlin = _nvfp4.use_marlin
    self.group_size = 16
    self.flashinfer_moe_backend = None
    if self.allow_flashinfer:
        self.flashinfer_moe_backend = get_flashinfer_moe_backend()
        logger.info_once(
            f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
            " for CompressedTensorsW4A4MoeMethod."
        )

apply ¶

apply(
    layer: Module,
    x: Tensor,
    router_logits: Tensor,
    top_k: int,
    renormalize: bool,
    use_grouped_topk: bool = False,
    topk_group: int | None = None,
    num_expert_group: int | None = None,
    global_num_experts: int = -1,
    expert_map: Tensor | None = None,
    custom_routing_function: Callable | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    e_score_correction_bias: Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    enable_eplb: bool = False,
    expert_load_view: Tensor | None = None,
    logical_to_physical_map: Tensor | None = None,
    logical_replica_count: Tensor | None = None,
) -> Tensor | tuple[Tensor, Tensor]

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def apply(
    self,
    layer: torch.nn.Module,
    x: torch.Tensor,
    router_logits: torch.Tensor,
    top_k: int,
    renormalize: bool,
    use_grouped_topk: bool = False,
    topk_group: int | None = None,
    num_expert_group: int | None = None,
    global_num_experts: int = -1,
    expert_map: torch.Tensor | None = None,
    custom_routing_function: Callable | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    e_score_correction_bias: torch.Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    enable_eplb: bool = False,
    expert_load_view: torch.Tensor | None = None,
    logical_to_physical_map: torch.Tensor | None = None,
    logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    if enable_eplb:
        raise NotImplementedError(
            "EPLB not supported for `CompressedTensorsW4A4MoeMethod` yet."
        )
    assert activation == "silu", "Only SiLU activation is supported."

    if (
        self.allow_flashinfer
        and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
    ):
        return flashinfer_trtllm_fp4_moe(
            layer=layer,
            x=x,
            router_logits=router_logits,
            top_k=top_k,
            global_num_experts=global_num_experts,
            num_expert_group=num_expert_group,
            topk_group=topk_group,
            custom_routing_function=custom_routing_function,
            e_score_correction_bias=e_score_correction_bias,
        )

    topk_weights, topk_ids, _ = FusedMoE.select_experts(
        hidden_states=x,
        router_logits=router_logits,
        use_grouped_topk=use_grouped_topk,
        top_k=top_k,
        renormalize=renormalize,
        topk_group=topk_group,
        num_expert_group=num_expert_group,
        custom_routing_function=custom_routing_function,
        scoring_func=scoring_func,
        routed_scaling_factor=routed_scaling_factor,
        e_score_correction_bias=e_score_correction_bias,
        indices_type=self.topk_indices_dtype,
    )

    if self.use_marlin:
        return fused_marlin_moe(
            x,
            layer.w13_weight,
            layer.w2_weight,
            None,
            None,
            layer.w13_weight_scale,
            layer.w2_weight_scale,
            router_logits,
            topk_weights,
            topk_ids,
            global_scale1=layer.w13_weight_scale_2,
            global_scale2=layer.w2_weight_scale_2,
            quant_type_id=scalar_types.float4_e2m1f.id,
            apply_router_weight_on_input=apply_router_weight_on_input,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            workspace=layer.workspace,
        )

    # FlashInfer fused experts path
    elif self.allow_flashinfer:
        from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
            flashinfer_cutlass_moe_fp4,
        )

        assert is_valid_flashinfer_cutlass_fused_moe(
            x, layer.w13_weight, layer.w2_weight
        ), "Flashinfer CUTLASS Fused MoE not applicable!"

        assert self.moe_quant_config is not None

        return flashinfer_cutlass_moe_fp4(
            hidden_states=x,
            w1=layer.w13_weight,
            w2=layer.w2_weight,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            quant_config=self.moe_quant_config,
            inplace=False,  # TODO(shuw): fix later, now output is high prec
            activation=activation,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            apply_router_weight_on_input=apply_router_weight_on_input,
        )
    else:
        from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4

        assert expert_map is None, (
            "Expert Parallelism / expert_map "
            "is currently not supported for "
            "CompressedTensorsW4A4MoeMethod."
        )
        assert self.moe_quant_config is not None

        # Cutlass moe takes in activations in BF16/Half precision
        # and fp4 quantized weights loaded from the checkpoint
        return cutlass_moe_fp4(
            a=x,
            w1_fp4=layer.w13_weight,
            w2_fp4=layer.w2_weight,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            quant_config=self.moe_quant_config,
            apply_router_weight_on_input=apply_router_weight_on_input,
            # TODO(bnell): derive these from arguments
            m=x.shape[0],
            n=layer.w2_weight.shape[2] * 2,
            k=x.shape[1],
            e=layer.w13_weight.shape[0],
        ).to(x.dtype)

create_weights ¶

create_weights(
    layer: Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: dtype,
    **extra_weight_attrs,
)

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def create_weights(
    self,
    layer: torch.nn.Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: torch.dtype,
    **extra_weight_attrs,
):
    layer.num_experts = num_experts
    layer.params_dtype = params_dtype

    w13_weight = torch.nn.Parameter(
        torch.empty(
            num_experts,
            2 * intermediate_size_per_partition,
            # 2 fp4 items are packed in the input dimension
            hidden_size // 2,
            requires_grad=False,
            dtype=torch.uint8,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w13_weight_packed", w13_weight)
    set_weight_attrs(w13_weight, extra_weight_attrs)

    w2_weight = torch.nn.Parameter(
        torch.empty(
            num_experts,
            hidden_size,
            # 2 fp4 items are packed in the input dimension
            intermediate_size_per_partition // 2,
            dtype=torch.uint8,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w2_weight_packed", w2_weight)
    set_weight_attrs(w2_weight, extra_weight_attrs)

    # Weight Scales
    w13_weight_scale = torch.nn.Parameter(
        torch.empty(
            num_experts,
            2 * intermediate_size_per_partition,
            # 2 fp4 items are packed in the input dimension
            hidden_size // self.group_size,
            dtype=torch.float8_e4m3fn,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w13_weight_scale", w13_weight_scale)
    extra_weight_attrs.update(
        {"quant_method": FusedMoeWeightScaleSupported.GROUP.value}
    )
    set_weight_attrs(w13_weight_scale, extra_weight_attrs)

    w2_weight_scale = torch.nn.Parameter(
        torch.empty(
            num_experts,
            hidden_size,
            # 2 fp4 items are packed in the input dimension
            intermediate_size_per_partition // self.group_size,
            dtype=torch.float8_e4m3fn,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w2_weight_scale", w2_weight_scale)
    extra_weight_attrs.update(
        {"quant_method": FusedMoeWeightScaleSupported.GROUP.value}
    )
    set_weight_attrs(w2_weight_scale, extra_weight_attrs)

    # Weight Global Scales
    w13_weight_scale_2 = torch.nn.Parameter(
        torch.empty(num_experts, 2, dtype=torch.float32), requires_grad=False
    )
    layer.register_parameter("w13_weight_global_scale", w13_weight_scale_2)
    extra_weight_attrs.update(
        {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
    )
    set_weight_attrs(w13_weight_scale_2, extra_weight_attrs)

    w2_weight_scale_2 = torch.nn.Parameter(
        torch.empty(num_experts, dtype=torch.float32), requires_grad=False
    )
    layer.register_parameter("w2_weight_global_scale", w2_weight_scale_2)
    extra_weight_attrs.update(
        {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
    )
    set_weight_attrs(w2_weight_scale_2, extra_weight_attrs)

    # Input Global Scales
    w13_input_scale = torch.nn.Parameter(
        torch.empty(num_experts, 2, dtype=torch.float32), requires_grad=False
    )
    layer.register_parameter("w13_input_global_scale", w13_input_scale)
    extra_weight_attrs.update(
        {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
    )
    set_weight_attrs(w13_input_scale, extra_weight_attrs)

    w2_input_scale = torch.nn.Parameter(
        torch.empty(num_experts, dtype=torch.float32), requires_grad=False
    )
    layer.register_parameter("w2_input_global_scale", w2_input_scale)
    extra_weight_attrs.update(
        {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
    )
    set_weight_attrs(w2_input_scale, extra_weight_attrs)

get_fused_moe_quant_config ¶

get_fused_moe_quant_config(
    layer: Module,
) -> FusedMoEQuantConfig | None

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def get_fused_moe_quant_config(
    self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
    if (
        self.use_marlin
        or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
    ):
        return None

    return nvfp4_moe_quant_config(
        g1_alphas=layer.g1_alphas,
        g2_alphas=layer.g2_alphas,
        a1_gscale=layer.w13_input_scale_quant,
        a2_gscale=layer.w2_input_scale_quant,
        w1_scale=layer.w13_weight_scale,
        w2_scale=layer.w2_weight_scale,
    )

maybe_make_prepare_finalize ¶

maybe_make_prepare_finalize(
    routing_tables: tuple[Tensor, Tensor, Tensor]
    | None = None,
) -> FusedMoEPrepareAndFinalize | None

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def maybe_make_prepare_finalize(
    self,
    routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
) -> mk.FusedMoEPrepareAndFinalize | None:
    if self.use_marlin or (
        self.allow_flashinfer
        and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
    ):
        return None
    elif not self.allow_flashinfer:
        return super().maybe_make_prepare_finalize(routing_tables)

    prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize(self.moe)
    logger.debug_once("%s", prepare_finalize.__class__.__name__)
    return prepare_finalize

process_weights_after_loading ¶

process_weights_after_loading(layer: Module) -> None

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    # From packed to weight
    layer.w13_weight = torch.nn.Parameter(
        layer.w13_weight_packed.data, requires_grad=False
    )
    delattr(layer, "w13_weight_packed")

    layer.w2_weight = torch.nn.Parameter(
        layer.w2_weight_packed.data, requires_grad=False
    )
    delattr(layer, "w2_weight_packed")

    # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
    if self.allow_flashinfer:
        w, s = reorder_w1w3_to_w3w1(
            layer.w13_weight.data, layer.w13_weight_scale.data, dim=-2
        )
        layer.w13_weight = torch.nn.Parameter(w, requires_grad=False)
        layer.w13_weight_scale = torch.nn.Parameter(s, requires_grad=False)

    if not torch.allclose(
        layer.w13_weight_global_scale[:, 0], layer.w13_weight_global_scale[:, 1]
    ):
        logger.warning_once(
            "w1_weight_global_scale must match w3_weight_global_scale. "
            "Accuracy may be affected."
        )

    # Take inverse of global scale saved to disk
    layer.w13_weight_scale_2 = torch.nn.Parameter(
        1 / layer.w13_weight_global_scale[:, 0], requires_grad=False
    )

    layer.w2_weight_scale_2 = torch.nn.Parameter(
        1 / layer.w2_weight_global_scale.data, requires_grad=False
    )

    if self.use_marlin:
        prepare_moe_fp4_layer_for_marlin(layer)
        return
    # w13
    if (
        self.allow_flashinfer
        and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
    ):
        w13_input_global_scale = (
            layer.w13_input_global_scale.min()
            .to(torch.float32)
            .expand(layer.num_experts)
        )
    else:
        w13_input_global_scale = layer.w13_input_global_scale.min(dim=1).values.to(
            torch.float32
        )
    layer.g1_alphas = torch.nn.Parameter(
        ((1 / w13_input_global_scale) * layer.w13_weight_scale_2),
        requires_grad=False,
    )

    layer.w13_input_scale_quant = torch.nn.Parameter(
        (w13_input_global_scale), requires_grad=False
    )

    # w2
    if (
        self.allow_flashinfer
        and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
    ):
        w2_input_global_scale = (
            layer.w2_input_global_scale.min()
            .to(torch.float32)
            .expand(layer.num_experts)
        )
    else:
        w2_input_global_scale = layer.w2_input_global_scale

    layer.g2_alphas = torch.nn.Parameter(
        ((1 / w2_input_global_scale) * layer.w2_weight_scale_2).to(torch.float32),
        requires_grad=False,
    )

    layer.w2_input_scale_quant = torch.nn.Parameter(
        (w2_input_global_scale), requires_grad=False
    )

    # TensorRT-LLM specific processing
    if (
        self.allow_flashinfer
        and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
    ):
        # Prepare static weights for TRT-LLM kernel
        # alternate: prepare_static_weight_layouts_for_trtllm_moe
        (
            gemm1_weights_fp4_shuffled,
            gemm1_scales_fp4_shuffled,
            gemm2_weights_fp4_shuffled,
            gemm2_scales_fp4_shuffled,
        ) = prepare_static_weights_for_trtllm_fp4_moe(
            layer.w13_weight,
            layer.w2_weight,
            layer.w13_weight_scale,
            layer.w2_weight_scale,
            layer.w2_weight.size(-2),  # hidden_size
            layer.w13_weight.size(-2) // 2,  # intermediate_size
            layer.w13_weight.size(0),  # num_experts
        )
        logger.debug_once("Finished shuffling weights for TRT-LLM MOE")

        layer.gemm1_weights_fp4_shuffled = Parameter(
            gemm1_weights_fp4_shuffled, requires_grad=False
        )
        layer.gemm2_weights_fp4_shuffled = Parameter(
            gemm2_weights_fp4_shuffled, requires_grad=False
        )
        layer.gemm1_scales_fp4_shuffled = Parameter(
            gemm1_scales_fp4_shuffled, requires_grad=False
        )
        layer.gemm2_scales_fp4_shuffled = Parameter(
            gemm2_scales_fp4_shuffled, requires_grad=False
        )

        # Additional parameter needed for TRT-LLM
        layer.g1_scale_c = Parameter(
            (layer.w2_input_scale_quant * layer.g1_alphas).to(torch.float32),
            requires_grad=False,
        )

        # Clean up weights that won't be used by TRT-LLM
        del layer.w2_weight
        del layer.w2_weight_scale
        del layer.w13_weight
        del layer.w13_weight_scale
    else:
        # swizzle weight scales
        layer.w13_weight_scale = torch.nn.Parameter(
            swizzle_blockscale(layer.w13_weight_scale), requires_grad=False
        )

        layer.w2_weight_scale = torch.nn.Parameter(
            swizzle_blockscale(layer.w2_weight_scale), requires_grad=False
        )

select_gemm_impl ¶

select_gemm_impl(
    prepare_finalize: FusedMoEPrepareAndFinalize,
    layer: Module,
) -> FusedMoEPermuteExpertsUnpermute

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def select_gemm_impl(
    self,
    prepare_finalize: mk.FusedMoEPrepareAndFinalize,
    layer: torch.nn.Module,
) -> mk.FusedMoEPermuteExpertsUnpermute:
    assert self.moe_quant_config is not None
    """Return the appropriate GEMM experts implementation."""
    experts = select_nvfp4_gemm_impl(
        self.moe,
        self.moe_quant_config,
        allow_flashinfer=self.allow_flashinfer,
    )
    logger.debug_once("Using %s", experts.__class__.__name__)
    return experts

CompressedTensorsW4A8Int8MoEMethod ¶

Bases: CompressedTensorsMoEMethod

CPU-only MoE method using dynamic 4-bit matmul kernels on Arm Platform - Weights: int4 (stored as int8 values in [-8,7], packed to uint8 nibbles) - Scales: Fp32 for Channelwise , bf16 for groupwise quantization - Bias: Same data type as original weights - Activations: FP32/Bf16 dynamic per-token (A8 Int), quantized inside the kernel

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
    """
    CPU-only MoE method using dynamic 4-bit matmul kernels on Arm Platform
    - Weights: int4 (stored as int8 values in [-8,7], packed to uint8 nibbles)
    - Scales: Fp32 for Channelwise , bf16 for groupwise quantization
    - Bias: Same data type as original weights
    - Activations: FP32/Bf16 dynamic per-token (A8 Int),
      quantized inside the kernel
    """

    def __init__(
        self,
        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
        moe: FusedMoEConfig,
    ):
        super().__init__(moe)
        self.has_bias = self.moe.has_bias
        self.quant_config = quant_config

        # Validate scheme: weights=W4 (channel or group),
        # activations=dynamic TOKEN (A8)
        wq = self.quant_config.target_scheme_map["Linear"].get("weights")
        aq = self.quant_config.target_scheme_map["Linear"].get("input_activations")

        # Must be dynamic per-token activations
        if aq.strategy != QuantizationStrategy.TOKEN or not aq.dynamic:
            raise ValueError(
                "W4A8-int MoE needs dynamic per-token activation quantization."
            )

        # Weight can be channel-wise (group_size=None) or group-wise
        self.group_size = wq.group_size if (wq.group_size is not None) else -1
        if wq.num_bits != 4:
            raise ValueError("This method only supports 4-bit weights (num_bits=4).")

        # CPU only
        if not current_platform.is_cpu():
            raise ValueError("CompressedTensorsW4A8Int8MoEMethod is CPU-only.")

        # Arm: check _dyn ops availability
        if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
            try:
                _ = torch.ops.aten._dyn_quant_matmul_4bit
                _ = torch.ops.aten._dyn_quant_pack_4bit_weight
            except AttributeError as err:
                raise RuntimeError(
                    f"""PyTorch {torch.__version__} lacks _dyn_quant_* 4bit ops;
                    install a newer build."""
                ) from err
        self.static_input_scales = False  # always dynamic per token

    # ---- parameter creation ----
    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        # Shapes per local rank (TP/EP):
        #   w13: [E, 2*I_local, H]  int8  (int4 values in [-8,7])
        #   w2 : [E, H, I_local]    int8
        # Scales:
        #   channel-wise: group_size=-1 -> per-output-row, single scale per row
        #   group-wise  : group_size=g   ->
        #   per-output-row, (in_features/g) scales

        E = num_experts
        H = hidden_size
        IN = intermediate_size_per_partition
        g = self.group_size

        # Per-row scale columns
        def _n_scale_cols(in_features: int) -> int:
            return 1 if g == -1 else (in_features // g)

        # Register unpacked int4-as-int8 weights the loader will fill.
        w13 = torch.nn.Parameter(
            torch.empty(E, 2 * IN, H, dtype=torch.int8), requires_grad=False
        )
        set_weight_attrs(w13, extra_weight_attrs)
        layer.register_parameter("w13_weight", w13)

        w2 = torch.nn.Parameter(
            torch.empty(E, H, IN, dtype=torch.int8), requires_grad=False
        )
        set_weight_attrs(w2, extra_weight_attrs)
        layer.register_parameter("w2_weight", w2)

        # Register scales
        # KleidiAI groupwise kernels accepts float32 scales
        # KleidiAI groupwise kernels accepts bfloat16 scales
        scale_dtype = torch.float32 if g == -1 else torch.bfloat16

        w13_s = torch.nn.Parameter(
            torch.ones(E, 2 * IN, _n_scale_cols(H), dtype=scale_dtype),
            requires_grad=False,
        )
        set_weight_attrs(
            w13_s,
            {"quant_method": "channel" if g == -1 else "group", **extra_weight_attrs},
        )
        layer.register_parameter("w13_weight_scale", w13_s)

        w2_s = torch.nn.Parameter(
            torch.ones(E, H, _n_scale_cols(IN), dtype=scale_dtype), requires_grad=False
        )
        set_weight_attrs(
            w2_s,
            {"quant_method": "channel" if g == -1 else "group", **extra_weight_attrs},
        )
        layer.register_parameter("w2_weight_scale", w2_s)

        if self.has_bias:
            w13_bias = torch.nn.Parameter(
                torch.zeros(E, 2 * IN, dtype=params_dtype), requires_grad=False
            )
            layer.register_parameter("w13_bias", w13_bias)
            set_weight_attrs(w13_bias, extra_weight_attrs)

            w2_bias = torch.nn.Parameter(
                torch.zeros(num_experts, hidden_size, dtype=params_dtype),
                requires_grad=False,
            )
            layer.register_parameter("w2_bias", w2_bias)
            set_weight_attrs(w2_bias, extra_weight_attrs)

        # Placeholders for packed weights (will be replaced after packing)
        layer.register_parameter(
            "w13_weight_packed", torch.nn.Parameter(torch.empty(0), requires_grad=False)
        )
        set_weight_attrs(layer.w13_weight_packed, extra_weight_attrs)

        layer.register_parameter(
            "w2_weight_packed", torch.nn.Parameter(torch.empty(0), requires_grad=False)
        )
        set_weight_attrs(layer.w2_weight_packed, extra_weight_attrs)

        # dims for 4 bit fused matmuls
        layer.w13_in_features = H
        layer.w13_out_features = 2 * IN
        layer.w2_in_features = IN
        layer.w2_out_features = H
        layer.group_size = g

    # post-load packing to dyn-4bit KleidiAI kernel's format
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        E = layer.w13_weight.shape[0]
        H = layer.w13_in_features
        I2 = layer.w13_out_features
        IN = layer.w2_in_features
        g = layer.group_size

        def _pack_matrix(
            int4_as_int8_2d: torch.Tensor,
            scales_2d: torch.Tensor,
            bias_1d: torch.Tensor | None,
            in_features: int,
            out_features: int,
        ) -> torch.Tensor:
            # int4 values are stored as int8 in [-8,7].
            # Shift to unsigned nibble and pack pairs along input-dim.
            tmp = int4_as_int8_2d.add(8)  # [out, in]
            uint8_nibbles = ((tmp[:, 1::2] << 4) | tmp[:, ::2]).to(
                torch.uint8
            )  # [out, in//2]

            # KleidiAI groupwise kernels accepts float32 scales
            # KleidiAI groupwise kernels accepts bfloat16 scales
            scale_dtype = torch.float32 if g == -1 else torch.bfloat16
            scales = scales_2d.to(scale_dtype)
            bias = None if bias_1d is None else bias_1d.to(torch.float32)
            return torch.ops.aten._dyn_quant_pack_4bit_weight(
                uint8_nibbles,
                scales,
                bias,
                g if g != -1 else in_features,
                in_features,
                out_features,
            )

        # Pack per expert
        w13_packed_list = []
        w2_packed_list = []

        has_w13_bias = hasattr(layer, "w13_bias") and layer.w13_bias is not None
        has_w2_bias = hasattr(layer, "w2_bias") and layer.w2_bias is not None

        for e in range(E):
            w13_packed_list.append(
                _pack_matrix(
                    layer.w13_weight[e],  # [2I, H]
                    layer.w13_weight_scale[e],  # [2I, H/g or 1]
                    layer.w13_bias[e] if has_w13_bias else None,  # [2I]
                    H,
                    I2,
                )
            )
            w2_packed_list.append(
                _pack_matrix(
                    # w2 shape is [H, IN]; we need [out, in] == [H, IN].
                    layer.w2_weight[e],  # [H, IN]
                    layer.w2_weight_scale[e],  # [H, IN/g or 1]
                    layer.w2_bias[e] if has_w2_bias else None,  # [H]
                    IN,
                    layer.w2_out_features,  # in_features=IN, out_features=H
                )
            )

        # each packed tensor has identical shape per expert; stack on dim 0
        w13_packed = torch.stack(w13_packed_list, dim=0)
        w2_packed = torch.stack(w2_packed_list, dim=0)

        replace_parameter(
            layer,
            "w13_weight_packed",
            torch.nn.Parameter(w13_packed, requires_grad=False),
        )
        replace_parameter(
            layer,
            "w2_weight_packed",
            torch.nn.Parameter(w2_packed, requires_grad=False),
        )

        # free raw tensors/scales/bias now that they're packed into the payload.
        replace_parameter(
            layer, "w13_weight", torch.nn.Parameter(torch.empty(0), requires_grad=False)
        )
        replace_parameter(
            layer, "w2_weight", torch.nn.Parameter(torch.empty(0), requires_grad=False)
        )
        replace_parameter(
            layer,
            "w13_weight_scale",
            torch.nn.Parameter(torch.empty(0), requires_grad=False),
        )
        replace_parameter(
            layer,
            "w2_weight_scale",
            torch.nn.Parameter(torch.empty(0), requires_grad=False),
        )
        if has_w13_bias:
            replace_parameter(
                layer,
                "w13_bias",
                torch.nn.Parameter(torch.empty(0), requires_grad=False),
            )
        if has_w2_bias:
            replace_parameter(
                layer,
                "w2_bias",
                torch.nn.Parameter(torch.empty(0), requires_grad=False),
            )

    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        # CPU dynamic 4-bit MoE path does not use modular kernels or
        # fused_experts; quant config is not needed.
        return None

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: int | None = None,
        num_expert_group: int | None = None,
        global_num_experts: int = -1,
        expert_map: torch.Tensor | None = None,
        custom_routing_function: Callable | None = None,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        e_score_correction_bias: torch.Tensor | None = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        enable_eplb: bool = False,
        expert_load_view: torch.Tensor | None = None,
        logical_to_physical_map: torch.Tensor | None = None,
        logical_replica_count: torch.Tensor | None = None,
    ) -> torch.Tensor:
        assert not enable_eplb, "EPLB not supported for W4A8-int MoE yet."
        assert activation in ("silu", "swigluoai", "swiglu"), (
            "Only SiLU/SwiGLUGU/SwiGLUUG are supported."
        )
        assert expert_map is None, """expert_map/EP not implemented
        for CPU dyn-4bit MoE."""

        def _act_kind(s: str) -> int:
            # 0 = SwiGLU_Gu (SiLU(g)*u), 1 = SwiGLU_Ug (SiLU(u)*g), 2 = SiLU
            if s == "swiglu":
                return 0
            if s == "swigluoai":
                return 1
            if s == "silu":
                return 2
            raise ValueError(f"Unknown activation '{s}'")

        # Apply topk softmax on router output
        topk_weights, topk_ids = select_experts(
            hidden_states=x,
            router_logits=router_logits,
            use_grouped_topk=use_grouped_topk,
            top_k=top_k,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            routed_scaling_factor=routed_scaling_factor,
            e_score_correction_bias=e_score_correction_bias,
        )

        return torch.ops._C.dynamic_4bit_int_moe(
            x,
            topk_ids.to(torch.long),
            topk_weights,
            layer.w13_weight_packed,
            layer.w2_weight_packed,
            layer.w2_out_features,
            layer.w2_in_features,
            layer.w13_out_features,
            layer.group_size,
            apply_router_weight_on_input,
            int(_act_kind(activation)),
        )

group_size `instance-attribute` ¶

group_size = group_size if group_size is not None else -1

has_bias `instance-attribute` ¶

has_bias = has_bias

quant_config `instance-attribute` ¶

quant_config = quant_config

static_input_scales `instance-attribute` ¶

static_input_scales = False

init ¶

__init__(
    quant_config: CompressedTensorsConfig,
    moe: FusedMoEConfig,
)

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def __init__(
    self,
    quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
    moe: FusedMoEConfig,
):
    super().__init__(moe)
    self.has_bias = self.moe.has_bias
    self.quant_config = quant_config

    # Validate scheme: weights=W4 (channel or group),
    # activations=dynamic TOKEN (A8)
    wq = self.quant_config.target_scheme_map["Linear"].get("weights")
    aq = self.quant_config.target_scheme_map["Linear"].get("input_activations")

    # Must be dynamic per-token activations
    if aq.strategy != QuantizationStrategy.TOKEN or not aq.dynamic:
        raise ValueError(
            "W4A8-int MoE needs dynamic per-token activation quantization."
        )

    # Weight can be channel-wise (group_size=None) or group-wise
    self.group_size = wq.group_size if (wq.group_size is not None) else -1
    if wq.num_bits != 4:
        raise ValueError("This method only supports 4-bit weights (num_bits=4).")

    # CPU only
    if not current_platform.is_cpu():
        raise ValueError("CompressedTensorsW4A8Int8MoEMethod is CPU-only.")

    # Arm: check _dyn ops availability
    if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
        try:
            _ = torch.ops.aten._dyn_quant_matmul_4bit
            _ = torch.ops.aten._dyn_quant_pack_4bit_weight
        except AttributeError as err:
            raise RuntimeError(
                f"""PyTorch {torch.__version__} lacks _dyn_quant_* 4bit ops;
                install a newer build."""
            ) from err
    self.static_input_scales = False  # always dynamic per token

apply ¶

apply(
    layer: Module,
    x: Tensor,
    router_logits: Tensor,
    top_k: int,
    renormalize: bool,
    use_grouped_topk: bool = False,
    topk_group: int | None = None,
    num_expert_group: int | None = None,
    global_num_experts: int = -1,
    expert_map: Tensor | None = None,
    custom_routing_function: Callable | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    e_score_correction_bias: Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    enable_eplb: bool = False,
    expert_load_view: Tensor | None = None,
    logical_to_physical_map: Tensor | None = None,
    logical_replica_count: Tensor | None = None,
) -> Tensor

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def apply(
    self,
    layer: torch.nn.Module,
    x: torch.Tensor,
    router_logits: torch.Tensor,
    top_k: int,
    renormalize: bool,
    use_grouped_topk: bool = False,
    topk_group: int | None = None,
    num_expert_group: int | None = None,
    global_num_experts: int = -1,
    expert_map: torch.Tensor | None = None,
    custom_routing_function: Callable | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    e_score_correction_bias: torch.Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    enable_eplb: bool = False,
    expert_load_view: torch.Tensor | None = None,
    logical_to_physical_map: torch.Tensor | None = None,
    logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor:
    assert not enable_eplb, "EPLB not supported for W4A8-int MoE yet."
    assert activation in ("silu", "swigluoai", "swiglu"), (
        "Only SiLU/SwiGLUGU/SwiGLUUG are supported."
    )
    assert expert_map is None, """expert_map/EP not implemented
    for CPU dyn-4bit MoE."""

    def _act_kind(s: str) -> int:
        # 0 = SwiGLU_Gu (SiLU(g)*u), 1 = SwiGLU_Ug (SiLU(u)*g), 2 = SiLU
        if s == "swiglu":
            return 0
        if s == "swigluoai":
            return 1
        if s == "silu":
            return 2
        raise ValueError(f"Unknown activation '{s}'")

    # Apply topk softmax on router output
    topk_weights, topk_ids = select_experts(
        hidden_states=x,
        router_logits=router_logits,
        use_grouped_topk=use_grouped_topk,
        top_k=top_k,
        renormalize=renormalize,
        topk_group=topk_group,
        num_expert_group=num_expert_group,
        custom_routing_function=custom_routing_function,
        scoring_func=scoring_func,
        routed_scaling_factor=routed_scaling_factor,
        e_score_correction_bias=e_score_correction_bias,
    )

    return torch.ops._C.dynamic_4bit_int_moe(
        x,
        topk_ids.to(torch.long),
        topk_weights,
        layer.w13_weight_packed,
        layer.w2_weight_packed,
        layer.w2_out_features,
        layer.w2_in_features,
        layer.w13_out_features,
        layer.group_size,
        apply_router_weight_on_input,
        int(_act_kind(activation)),
    )

create_weights ¶

create_weights(
    layer: Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: dtype,
    **extra_weight_attrs,
)

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def create_weights(
    self,
    layer: torch.nn.Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: torch.dtype,
    **extra_weight_attrs,
):
    # Shapes per local rank (TP/EP):
    #   w13: [E, 2*I_local, H]  int8  (int4 values in [-8,7])
    #   w2 : [E, H, I_local]    int8
    # Scales:
    #   channel-wise: group_size=-1 -> per-output-row, single scale per row
    #   group-wise  : group_size=g   ->
    #   per-output-row, (in_features/g) scales

    E = num_experts
    H = hidden_size
    IN = intermediate_size_per_partition
    g = self.group_size

    # Per-row scale columns
    def _n_scale_cols(in_features: int) -> int:
        return 1 if g == -1 else (in_features // g)

    # Register unpacked int4-as-int8 weights the loader will fill.
    w13 = torch.nn.Parameter(
        torch.empty(E, 2 * IN, H, dtype=torch.int8), requires_grad=False
    )
    set_weight_attrs(w13, extra_weight_attrs)
    layer.register_parameter("w13_weight", w13)

    w2 = torch.nn.Parameter(
        torch.empty(E, H, IN, dtype=torch.int8), requires_grad=False
    )
    set_weight_attrs(w2, extra_weight_attrs)
    layer.register_parameter("w2_weight", w2)

    # Register scales
    # KleidiAI groupwise kernels accepts float32 scales
    # KleidiAI groupwise kernels accepts bfloat16 scales
    scale_dtype = torch.float32 if g == -1 else torch.bfloat16

    w13_s = torch.nn.Parameter(
        torch.ones(E, 2 * IN, _n_scale_cols(H), dtype=scale_dtype),
        requires_grad=False,
    )
    set_weight_attrs(
        w13_s,
        {"quant_method": "channel" if g == -1 else "group", **extra_weight_attrs},
    )
    layer.register_parameter("w13_weight_scale", w13_s)

    w2_s = torch.nn.Parameter(
        torch.ones(E, H, _n_scale_cols(IN), dtype=scale_dtype), requires_grad=False
    )
    set_weight_attrs(
        w2_s,
        {"quant_method": "channel" if g == -1 else "group", **extra_weight_attrs},
    )
    layer.register_parameter("w2_weight_scale", w2_s)

    if self.has_bias:
        w13_bias = torch.nn.Parameter(
            torch.zeros(E, 2 * IN, dtype=params_dtype), requires_grad=False
        )
        layer.register_parameter("w13_bias", w13_bias)
        set_weight_attrs(w13_bias, extra_weight_attrs)

        w2_bias = torch.nn.Parameter(
            torch.zeros(num_experts, hidden_size, dtype=params_dtype),
            requires_grad=False,
        )
        layer.register_parameter("w2_bias", w2_bias)
        set_weight_attrs(w2_bias, extra_weight_attrs)

    # Placeholders for packed weights (will be replaced after packing)
    layer.register_parameter(
        "w13_weight_packed", torch.nn.Parameter(torch.empty(0), requires_grad=False)
    )
    set_weight_attrs(layer.w13_weight_packed, extra_weight_attrs)

    layer.register_parameter(
        "w2_weight_packed", torch.nn.Parameter(torch.empty(0), requires_grad=False)
    )
    set_weight_attrs(layer.w2_weight_packed, extra_weight_attrs)

    # dims for 4 bit fused matmuls
    layer.w13_in_features = H
    layer.w13_out_features = 2 * IN
    layer.w2_in_features = IN
    layer.w2_out_features = H
    layer.group_size = g

get_fused_moe_quant_config ¶

get_fused_moe_quant_config(
    layer: Module,
) -> FusedMoEQuantConfig | None

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def get_fused_moe_quant_config(
    self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
    # CPU dynamic 4-bit MoE path does not use modular kernels or
    # fused_experts; quant config is not needed.
    return None

process_weights_after_loading ¶

process_weights_after_loading(layer: Module) -> None

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    E = layer.w13_weight.shape[0]
    H = layer.w13_in_features
    I2 = layer.w13_out_features
    IN = layer.w2_in_features
    g = layer.group_size

    def _pack_matrix(
        int4_as_int8_2d: torch.Tensor,
        scales_2d: torch.Tensor,
        bias_1d: torch.Tensor | None,
        in_features: int,
        out_features: int,
    ) -> torch.Tensor:
        # int4 values are stored as int8 in [-8,7].
        # Shift to unsigned nibble and pack pairs along input-dim.
        tmp = int4_as_int8_2d.add(8)  # [out, in]
        uint8_nibbles = ((tmp[:, 1::2] << 4) | tmp[:, ::2]).to(
            torch.uint8
        )  # [out, in//2]

        # KleidiAI groupwise kernels accepts float32 scales
        # KleidiAI groupwise kernels accepts bfloat16 scales
        scale_dtype = torch.float32 if g == -1 else torch.bfloat16
        scales = scales_2d.to(scale_dtype)
        bias = None if bias_1d is None else bias_1d.to(torch.float32)
        return torch.ops.aten._dyn_quant_pack_4bit_weight(
            uint8_nibbles,
            scales,
            bias,
            g if g != -1 else in_features,
            in_features,
            out_features,
        )

    # Pack per expert
    w13_packed_list = []
    w2_packed_list = []

    has_w13_bias = hasattr(layer, "w13_bias") and layer.w13_bias is not None
    has_w2_bias = hasattr(layer, "w2_bias") and layer.w2_bias is not None

    for e in range(E):
        w13_packed_list.append(
            _pack_matrix(
                layer.w13_weight[e],  # [2I, H]
                layer.w13_weight_scale[e],  # [2I, H/g or 1]
                layer.w13_bias[e] if has_w13_bias else None,  # [2I]
                H,
                I2,
            )
        )
        w2_packed_list.append(
            _pack_matrix(
                # w2 shape is [H, IN]; we need [out, in] == [H, IN].
                layer.w2_weight[e],  # [H, IN]
                layer.w2_weight_scale[e],  # [H, IN/g or 1]
                layer.w2_bias[e] if has_w2_bias else None,  # [H]
                IN,
                layer.w2_out_features,  # in_features=IN, out_features=H
            )
        )

    # each packed tensor has identical shape per expert; stack on dim 0
    w13_packed = torch.stack(w13_packed_list, dim=0)
    w2_packed = torch.stack(w2_packed_list, dim=0)

    replace_parameter(
        layer,
        "w13_weight_packed",
        torch.nn.Parameter(w13_packed, requires_grad=False),
    )
    replace_parameter(
        layer,
        "w2_weight_packed",
        torch.nn.Parameter(w2_packed, requires_grad=False),
    )

    # free raw tensors/scales/bias now that they're packed into the payload.
    replace_parameter(
        layer, "w13_weight", torch.nn.Parameter(torch.empty(0), requires_grad=False)
    )
    replace_parameter(
        layer, "w2_weight", torch.nn.Parameter(torch.empty(0), requires_grad=False)
    )
    replace_parameter(
        layer,
        "w13_weight_scale",
        torch.nn.Parameter(torch.empty(0), requires_grad=False),
    )
    replace_parameter(
        layer,
        "w2_weight_scale",
        torch.nn.Parameter(torch.empty(0), requires_grad=False),
    )
    if has_w13_bias:
        replace_parameter(
            layer,
            "w13_bias",
            torch.nn.Parameter(torch.empty(0), requires_grad=False),
        )
    if has_w2_bias:
        replace_parameter(
            layer,
            "w2_bias",
            torch.nn.Parameter(torch.empty(0), requires_grad=False),
        )

CompressedTensorsW8A8Fp8MoEMethod ¶

Bases: CompressedTensorsMoEMethod

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
    def __init__(
        self,
        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
        moe: FusedMoEConfig,
    ):
        super().__init__(moe)
        self.quant_config = quant_config
        self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights")
        self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
            "input_activations"
        )

        per_tensor = (
            self.weight_quant.strategy == QuantizationStrategy.TENSOR
            and self.input_quant.strategy == QuantizationStrategy.TENSOR
        )
        per_channel = (
            self.weight_quant.strategy == QuantizationStrategy.CHANNEL
            and self.input_quant.strategy == QuantizationStrategy.TOKEN
        )
        if not (per_tensor or per_channel):
            assert self.weight_quant.strategy == QuantizationStrategy.BLOCK
            self.weight_block_size = self.weight_quant.block_structure
            assert self.weight_quant.dynamic is not None
        else:
            self.weight_block_size = None
        self.block_quant = self.weight_block_size is not None

        self.static_input_scales = not self.input_quant.dynamic
        if self.static_input_scales and per_channel:
            raise ValueError(
                "For FP8 Fused MoE layer, we require either per tensor or "
                "channelwise, dynamic per token quantization."
            )

        # For GPUs that lack FP8 hardware support, we can leverage the Marlin
        # kernel for fast weight-only FP8 quantization
        self.use_marlin = (
            not current_platform.has_device_capability(89)
            or envs.VLLM_TEST_FORCE_FP8_MARLIN
            and not self.block_quant
        )
        # Disable marlin for rocm
        if current_platform.is_rocm():
            self.use_marlin = False

        self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()

        # cutlass path
        self.is_fp8_w8a8_sm100 = quant_config._is_fp8_w8a8_sm100(
            self.weight_quant, self.input_quant
        )
        self.use_cutlass = not self.block_quant and (
            quant_config._is_fp8_w8a8_sm90(self.weight_quant, self.input_quant)
            or self.is_fp8_w8a8_sm100
        )
        self.disable_expert_map = False

    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        layer.intermediate_size_per_partition = intermediate_size_per_partition
        layer.hidden_size = hidden_size
        layer.num_experts = num_experts
        layer.orig_dtype = params_dtype
        layer.weight_block_size = None

        params_dtype = torch.float8_e4m3fn

        if self.block_quant:
            assert self.weight_block_size is not None
            layer.weight_block_size = self.weight_block_size
            tp_size = get_tensor_model_parallel_world_size()
            block_n, block_k = (
                self.weight_block_size[0],
                self.weight_block_size[1],
            )
            # NOTE: To ensure proper alignment of the block-wise quantization
            # scales, the output_size of the weights for both the gate and up
            # layers must be divisible by block_n.
            # Required by column parallel or enabling merged weights
            if intermediate_size_per_partition % block_n != 0:
                raise ValueError(
                    f"The output_size of gate's and up's weight = "
                    f"{intermediate_size_per_partition} is not divisible by "
                    f"weight quantization block_n = {block_n}."
                )
            if tp_size > 1 and intermediate_size_per_partition % block_k != 0:
                # Required by row parallel
                raise ValueError(
                    f"The input_size of down's weight = "
                    f"{intermediate_size_per_partition} is not divisible by "
                    f"weight quantization block_k = {block_k}."
                )

        # WEIGHTS
        w13_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                2 * intermediate_size_per_partition,
                hidden_size,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)

        w2_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                intermediate_size_per_partition,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)

        # WEIGHT_SCALES
        if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
            # Allocate 2 scales for w1 and w3 respectively.
            # They are combined to a single scale after weight loading.
            w13_weight_scale = torch.nn.Parameter(
                torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
            )
            layer.register_parameter("w13_weight_scale", w13_weight_scale)
            w2_weight_scale = torch.nn.Parameter(
                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
            )
            layer.register_parameter("w2_weight_scale", w2_weight_scale)
            # Add PER-TENSOR quantization for FusedMoE.weight_loader.
            extra_weight_attrs.update(
                {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
            )
            set_weight_attrs(w13_weight_scale, extra_weight_attrs)
            set_weight_attrs(w2_weight_scale, extra_weight_attrs)

        elif self.weight_quant.strategy == QuantizationStrategy.CHANNEL:
            w13_weight_scale = torch.nn.Parameter(
                torch.ones(
                    num_experts,
                    2 * intermediate_size_per_partition,
                    1,
                    dtype=torch.float32,
                ),
                requires_grad=False,
            )
            layer.register_parameter("w13_weight_scale", w13_weight_scale)
            w2_weight_scale = torch.nn.Parameter(
                torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
                requires_grad=False,
            )
            layer.register_parameter("w2_weight_scale", w2_weight_scale)
            # Add PER-CHANNEL quantization for FusedMoE.weight_loader.
            extra_weight_attrs.update(
                {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
            )
            set_weight_attrs(w13_weight_scale, extra_weight_attrs)
            set_weight_attrs(w2_weight_scale, extra_weight_attrs)

        elif self.weight_quant.strategy == QuantizationStrategy.BLOCK:
            w13_weight_scale = torch.nn.Parameter(
                torch.ones(
                    num_experts,
                    2 * ((intermediate_size_per_partition + block_n - 1) // block_n),
                    (hidden_size + block_k - 1) // block_k,
                    dtype=torch.float32,
                ),
                requires_grad=False,
            )
            layer.register_parameter("w13_weight_scale", w13_weight_scale)
            w2_weight_scale = torch.nn.Parameter(
                torch.ones(
                    num_experts,
                    (hidden_size + block_n - 1) // block_n,
                    (intermediate_size_per_partition + block_k - 1) // block_k,
                    dtype=torch.float32,
                ),
                requires_grad=False,
            )
            layer.register_parameter("w2_weight_scale", w2_weight_scale)
            # Add PER-CHANNEL quantization for FusedMoE.weight_loader.
            extra_weight_attrs.update(
                {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
            )
            set_weight_attrs(w13_weight_scale, extra_weight_attrs)
            set_weight_attrs(w2_weight_scale, extra_weight_attrs)

        # INPUT_SCALES
        if self.static_input_scales:
            w13_input_scale = torch.nn.Parameter(
                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
            )
            layer.register_parameter("w13_input_scale", w13_input_scale)
            set_weight_attrs(w13_input_scale, extra_weight_attrs)

            w2_input_scale = torch.nn.Parameter(
                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
            )
            layer.register_parameter("w2_input_scale", w2_input_scale)
            set_weight_attrs(w2_input_scale, extra_weight_attrs)
        else:
            layer.w13_input_scale = None
            layer.w2_input_scale = None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # Fp8 moe kernels require a single activation scale.
        # We take the max of all the scales in case they differ.
        if self.static_input_scales:
            assert self.input_quant.strategy == QuantizationStrategy.TENSOR
            if layer.w13_input_scale is None or layer.w2_input_scale is None:
                raise ValueError(
                    "QuantConfig has static quantization, but found "
                    "activation scales are None."
                )
            if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
                layer.w2_input_scale
            ):
                logger.warning_once(
                    "Found input_scales that are not equal for "
                    "fp8 MoE layer. Using the maximum across experts "
                    "for each layer."
                )
            layer.w13_input_scale = torch.nn.Parameter(
                layer.w13_input_scale.max(), requires_grad=False
            )
            layer.w2_input_scale = torch.nn.Parameter(
                layer.w2_input_scale.max(), requires_grad=False
            )

        if current_platform.is_fp8_fnuz():
            # Normalize the weights and scales
            w13_weight, w13_weight_scale, w13_input_scale = (
                normalize_e4m3fn_to_e4m3fnuz(
                    layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale
                )
            )
            w2_weight, w2_weight_scale, w2_input_scale = normalize_e4m3fn_to_e4m3fnuz(
                layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale
            )
            # Reset the parameter
            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
            layer.w13_weight_scale = torch.nn.Parameter(
                w13_weight_scale, requires_grad=False
            )
            if w13_input_scale is not None:
                layer.w13_input_scale = torch.nn.Parameter(
                    w13_input_scale, requires_grad=False
                )
            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
            layer.w2_weight_scale = torch.nn.Parameter(
                w2_weight_scale, requires_grad=False
            )
            if w2_input_scale is not None:
                layer.w2_input_scale = torch.nn.Parameter(
                    w2_input_scale, requires_grad=False
                )

        # For Per-TENSOR case, Fp8 moe kernel needs single weight scale
        # for w13 per expert. Use max then dequant and requant each expert.
        if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
            assert layer.w13_weight_scale is not None
            shard_size = layer.intermediate_size_per_partition
            max_w13_scales = layer.w13_weight_scale.max(dim=1).values
            for expert_id in range(layer.local_num_experts):
                start = 0
                for shard_id in range(2):
                    dq_weight = per_tensor_dequantize(
                        layer.w13_weight[expert_id][start : start + shard_size, :],
                        layer.w13_weight_scale[expert_id][shard_id],
                    )
                    layer.w13_weight[expert_id][start : start + shard_size, :], _ = (
                        ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
                    )
                    start += shard_size
            layer.w13_weight_scale = torch.nn.Parameter(
                max_w13_scales, requires_grad=False
            )

        # Property to determine if AITER is used
        if self.rocm_aiter_moe_enabled:
            # reshaping weights is required for aiter moe kernel.
            shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
                layer.w13_weight.data, layer.w2_weight.data
            )

            layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
            layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)

        elif self.use_marlin:
            prepare_moe_fp8_layer_for_marlin(layer, False)
            # Activations not quantized for marlin.
            del layer.w13_input_scale
            del layer.w2_input_scale

        if self.use_cutlass:
            assert self.weight_quant.strategy != QuantizationStrategy.BLOCK
            device = layer.w13_weight.device
            # ab_strides1 and c_strides2 are the same
            self.ab_strides1_c_strides2 = torch.full(
                (layer.local_num_experts,),
                layer.hidden_size,
                device=device,
                dtype=torch.int64,
            )
            self.ab_strides2 = torch.full(
                (layer.local_num_experts,),
                layer.intermediate_size_per_partition,
                device=device,
                dtype=torch.int64,
            )
            self.c_strides1 = torch.full(
                (layer.local_num_experts,),
                2 * layer.intermediate_size_per_partition,
                device=device,
                dtype=torch.int64,
            )

        if is_deep_gemm_e8m0_used() and self.block_quant:
            assert layer.weight_block_size is not None
            # Re-quantise the expert weights so their scales are UE8M0.
            block_sz = tuple(layer.weight_block_size)
            requant_weight_ue8m0_inplace(
                layer.w13_weight.data,
                layer.w13_weight_scale.data,
                block_sz,
            )
            requant_weight_ue8m0_inplace(
                layer.w2_weight.data,
                layer.w2_weight_scale.data,
                block_sz,
            )

            # Ensure column-major TMA alignment expected by DeepGEMM.
            if expert_weight_is_col_major(layer.w13_weight_scale):
                layer.w13_weight_scale = get_col_major_tma_aligned_tensor(
                    layer.w13_weight_scale
                )
            if expert_weight_is_col_major(layer.w2_weight_scale):
                layer.w2_weight_scale = get_col_major_tma_aligned_tensor(
                    layer.w2_weight_scale
                )

    def maybe_make_prepare_finalize(
        self,
        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    ) -> mk.FusedMoEPrepareAndFinalize | None:
        if self.use_marlin or self.rocm_aiter_moe_enabled:
            return None
        else:
            return super().maybe_make_prepare_finalize(routing_tables)

    def select_gemm_impl(
        self,
        prepare_finalize: mk.FusedMoEPrepareAndFinalize,
        layer: torch.nn.Module,
    ) -> FusedMoEPermuteExpertsUnpermute:
        # cutlass path
        assert self.moe_quant_config is not None
        if self.use_cutlass:
            from vllm.model_executor.layers.fused_moe import (
                CutlassBatchedExpertsFp8,
                CutlassExpertsFp8,
            )

            experts: FusedMoEPermuteExpertsUnpermute

            num_dispatchers = prepare_finalize.num_dispatchers()

            if (
                prepare_finalize.activation_format
                == FusedMoEActivationFormat.BatchedExperts
            ):
                logger.debug("CutlassBatchedExpertsFp8(%s)", self.__class__.__name__)
                experts = CutlassBatchedExpertsFp8(
                    self.moe.num_local_experts,
                    num_dispatchers,
                    self.moe.in_dtype,
                    ab_strides1=self.ab_strides1_c_strides2,
                    ab_strides2=self.ab_strides2,
                    c_strides1=self.c_strides1,
                    c_strides2=self.ab_strides1_c_strides2,
                    quant_config=self.moe_quant_config,
                )
            else:
                logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__)
                experts = CutlassExpertsFp8(
                    self.moe.in_dtype,
                    ab_strides1=self.ab_strides1_c_strides2,
                    ab_strides2=self.ab_strides2,
                    c_strides1=self.c_strides1,
                    c_strides2=self.ab_strides1_c_strides2,
                    quant_config=self.moe_quant_config,
                )

            self.disable_expert_map = (
                num_dispatchers > 1 or not experts.supports_expert_map()
            )

            return experts

        # triton path
        from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
            BatchedTritonOrDeepGemmExperts,
        )
        from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
            TritonOrDeepGemmExperts,
        )

        assert not self.rocm_aiter_moe_enabled and not self.use_marlin

        if (
            prepare_finalize.activation_format
            == FusedMoEActivationFormat.BatchedExperts
        ):
            max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
            assert max_num_tokens_per_rank is not None

            logger.debug("BatchedTritonExperts(%s)", self.__class__.__name__)
            return BatchedTritonOrDeepGemmExperts(
                max_num_tokens=max_num_tokens_per_rank,
                num_dispatchers=prepare_finalize.num_dispatchers(),
                quant_config=self.moe_quant_config,
                allow_deep_gemm=(
                    envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
                ),
            )
        else:
            logger.debug("TritonOrDeepGemmExperts(%s)", self.__class__.__name__)
            return TritonOrDeepGemmExperts(
                self.moe_quant_config,
                allow_deep_gemm=(
                    envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
                ),
            )

    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        if self.use_marlin:
            return None

        per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
        per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL

        return fp8_w8a8_moe_quant_config(
            w1_scale=layer.w13_weight_scale,
            w2_scale=layer.w2_weight_scale,
            a1_scale=layer.w13_input_scale,
            a2_scale=layer.w2_input_scale,
            per_act_token_quant=per_act_token,
            per_out_ch_quant=per_channel_quant,
            block_shape=layer.weight_block_size,
        )

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: int | None = None,
        num_expert_group: int | None = None,
        global_num_experts: int = -1,
        expert_map: torch.Tensor | None = None,
        custom_routing_function: Callable | None = None,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        e_score_correction_bias: torch.Tensor | None = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        enable_eplb: bool = False,
        expert_load_view: torch.Tensor | None = None,
        logical_to_physical_map: torch.Tensor | None = None,
        logical_replica_count: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        if enable_eplb:
            assert expert_load_view is not None
            assert logical_to_physical_map is not None
            assert logical_replica_count is not None
            assert isinstance(layer, FusedMoE)

        topk_weights, topk_ids, _ = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
            use_grouped_topk=use_grouped_topk,
            top_k=top_k,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            routed_scaling_factor=routed_scaling_factor,
            e_score_correction_bias=e_score_correction_bias,
            indices_type=self.topk_indices_dtype,
            num_fused_shared_experts=layer.num_fused_shared_experts,
            enable_eplb=enable_eplb,
            expert_map=expert_map,
            expert_load_view=expert_load_view,
            logical_to_physical_map=logical_to_physical_map,
            logical_replica_count=logical_replica_count,
        )

        per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
        per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL

        if self.use_marlin:
            assert activation == "silu", f"{activation} not supported for Marlin MoE."
            return fused_marlin_moe(
                x,
                layer.w13_weight,
                layer.w2_weight,
                None,
                None,
                layer.w13_weight_scale,
                layer.w2_weight_scale,
                router_logits,
                topk_weights,
                topk_ids,
                quant_type_id=scalar_types.float8_e4m3fn.id,
                apply_router_weight_on_input=apply_router_weight_on_input,
                global_num_experts=global_num_experts,
                expert_map=expert_map,
                workspace=layer.workspace,
            )

        elif self.rocm_aiter_moe_enabled:
            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa E501
                rocm_aiter_fused_experts,
            )

            assert per_act_token == per_channel_quant
            assert self.moe_quant_config is not None
            return rocm_aiter_fused_experts(
                hidden_states=x,
                w1=layer.w13_weight,
                w2=layer.w2_weight,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                activation=activation,
                apply_router_weight_on_input=apply_router_weight_on_input,
                expert_map=expert_map,
                quant_config=self.moe_quant_config,
            )

        # cutlass path
        elif self.use_cutlass:
            assert self.moe_quant_config is not None

            # small-batch fallback on SM100
            if self.is_fp8_w8a8_sm100 and topk_ids.shape[0] <= 8:
                from vllm.model_executor.layers.fused_moe import fused_experts

                assert per_act_token == per_channel_quant
                return fused_experts(
                    hidden_states=x,
                    w1=layer.w13_weight,
                    w2=layer.w2_weight,
                    topk_weights=topk_weights,
                    topk_ids=topk_ids,
                    inplace=True,
                    activation=activation,
                    apply_router_weight_on_input=apply_router_weight_on_input,
                    global_num_experts=global_num_experts,
                    expert_map=None if self.disable_expert_map else expert_map,
                    quant_config=self.moe_quant_config,
                )
            else:
                from vllm.model_executor.layers.fused_moe.cutlass_moe import (
                    cutlass_moe_fp8,
                )

                assert per_act_token == per_channel_quant
                assert self.moe_quant_config is not None
                return cutlass_moe_fp8(
                    x,
                    layer.w13_weight,
                    layer.w2_weight,
                    topk_weights,
                    topk_ids,
                    quant_config=self.moe_quant_config,
                    activation=activation,
                    global_num_experts=global_num_experts,
                    expert_map=None if self.disable_expert_map else expert_map,
                    ab_strides1=self.ab_strides1_c_strides2,
                    ab_strides2=self.ab_strides2,
                    c_strides1=self.c_strides1,
                    c_strides2=self.ab_strides1_c_strides2,
                )

        else:
            from vllm.model_executor.layers.fused_moe import fused_experts

            assert per_act_token == per_channel_quant
            assert self.moe_quant_config is not None
            return fused_experts(
                hidden_states=x,
                w1=layer.w13_weight,
                w2=layer.w2_weight,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                inplace=True,
                activation=activation,
                apply_router_weight_on_input=apply_router_weight_on_input,
                global_num_experts=global_num_experts,
                expert_map=expert_map,
                quant_config=self.moe_quant_config,
            )

    @property
    def supports_eplb(self) -> bool:
        return True

block_quant `instance-attribute` ¶

block_quant = weight_block_size is not None

disable_expert_map `instance-attribute` ¶

disable_expert_map = False

input_quant `instance-attribute` ¶

input_quant = get('input_activations')

is_fp8_w8a8_sm100 `instance-attribute` ¶

is_fp8_w8a8_sm100 = _is_fp8_w8a8_sm100(
    weight_quant, input_quant
)

quant_config `instance-attribute` ¶

quant_config = quant_config

rocm_aiter_moe_enabled `instance-attribute` ¶

rocm_aiter_moe_enabled = is_fused_moe_enabled()

static_input_scales `instance-attribute` ¶

static_input_scales = not dynamic

supports_eplb `property` ¶

supports_eplb: bool

use_cutlass `instance-attribute` ¶

use_cutlass = not block_quant and (
    _is_fp8_w8a8_sm90(weight_quant, input_quant)
    or is_fp8_w8a8_sm100
)

use_marlin `instance-attribute` ¶

use_marlin = (
    not has_device_capability(89)
    or VLLM_TEST_FORCE_FP8_MARLIN
    and not block_quant
)

weight_block_size `instance-attribute` ¶

weight_block_size = block_structure

weight_quant `instance-attribute` ¶

weight_quant = get('weights')

init ¶

__init__(
    quant_config: CompressedTensorsConfig,
    moe: FusedMoEConfig,
)

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def __init__(
    self,
    quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
    moe: FusedMoEConfig,
):
    super().__init__(moe)
    self.quant_config = quant_config
    self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights")
    self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
        "input_activations"
    )

    per_tensor = (
        self.weight_quant.strategy == QuantizationStrategy.TENSOR
        and self.input_quant.strategy == QuantizationStrategy.TENSOR
    )
    per_channel = (
        self.weight_quant.strategy == QuantizationStrategy.CHANNEL
        and self.input_quant.strategy == QuantizationStrategy.TOKEN
    )
    if not (per_tensor or per_channel):
        assert self.weight_quant.strategy == QuantizationStrategy.BLOCK
        self.weight_block_size = self.weight_quant.block_structure
        assert self.weight_quant.dynamic is not None
    else:
        self.weight_block_size = None
    self.block_quant = self.weight_block_size is not None

    self.static_input_scales = not self.input_quant.dynamic
    if self.static_input_scales and per_channel:
        raise ValueError(
            "For FP8 Fused MoE layer, we require either per tensor or "
            "channelwise, dynamic per token quantization."
        )

    # For GPUs that lack FP8 hardware support, we can leverage the Marlin
    # kernel for fast weight-only FP8 quantization
    self.use_marlin = (
        not current_platform.has_device_capability(89)
        or envs.VLLM_TEST_FORCE_FP8_MARLIN
        and not self.block_quant
    )
    # Disable marlin for rocm
    if current_platform.is_rocm():
        self.use_marlin = False

    self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()

    # cutlass path
    self.is_fp8_w8a8_sm100 = quant_config._is_fp8_w8a8_sm100(
        self.weight_quant, self.input_quant
    )
    self.use_cutlass = not self.block_quant and (
        quant_config._is_fp8_w8a8_sm90(self.weight_quant, self.input_quant)
        or self.is_fp8_w8a8_sm100
    )
    self.disable_expert_map = False

apply ¶

apply(
    layer: Module,
    x: Tensor,
    router_logits: Tensor,
    top_k: int,
    renormalize: bool,
    use_grouped_topk: bool = False,
    topk_group: int | None = None,
    num_expert_group: int | None = None,
    global_num_experts: int = -1,
    expert_map: Tensor | None = None,
    custom_routing_function: Callable | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    e_score_correction_bias: Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    enable_eplb: bool = False,
    expert_load_view: Tensor | None = None,
    logical_to_physical_map: Tensor | None = None,
    logical_replica_count: Tensor | None = None,
) -> Tensor | tuple[Tensor, Tensor]

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def apply(
    self,
    layer: torch.nn.Module,
    x: torch.Tensor,
    router_logits: torch.Tensor,
    top_k: int,
    renormalize: bool,
    use_grouped_topk: bool = False,
    topk_group: int | None = None,
    num_expert_group: int | None = None,
    global_num_experts: int = -1,
    expert_map: torch.Tensor | None = None,
    custom_routing_function: Callable | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    e_score_correction_bias: torch.Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    enable_eplb: bool = False,
    expert_load_view: torch.Tensor | None = None,
    logical_to_physical_map: torch.Tensor | None = None,
    logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    if enable_eplb:
        assert expert_load_view is not None
        assert logical_to_physical_map is not None
        assert logical_replica_count is not None
        assert isinstance(layer, FusedMoE)

    topk_weights, topk_ids, _ = FusedMoE.select_experts(
        hidden_states=x,
        router_logits=router_logits,
        use_grouped_topk=use_grouped_topk,
        top_k=top_k,
        renormalize=renormalize,
        topk_group=topk_group,
        num_expert_group=num_expert_group,
        custom_routing_function=custom_routing_function,
        scoring_func=scoring_func,
        routed_scaling_factor=routed_scaling_factor,
        e_score_correction_bias=e_score_correction_bias,
        indices_type=self.topk_indices_dtype,
        num_fused_shared_experts=layer.num_fused_shared_experts,
        enable_eplb=enable_eplb,
        expert_map=expert_map,
        expert_load_view=expert_load_view,
        logical_to_physical_map=logical_to_physical_map,
        logical_replica_count=logical_replica_count,
    )

    per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
    per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL

    if self.use_marlin:
        assert activation == "silu", f"{activation} not supported for Marlin MoE."
        return fused_marlin_moe(
            x,
            layer.w13_weight,
            layer.w2_weight,
            None,
            None,
            layer.w13_weight_scale,
            layer.w2_weight_scale,
            router_logits,
            topk_weights,
            topk_ids,
            quant_type_id=scalar_types.float8_e4m3fn.id,
            apply_router_weight_on_input=apply_router_weight_on_input,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            workspace=layer.workspace,
        )

    elif self.rocm_aiter_moe_enabled:
        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa E501
            rocm_aiter_fused_experts,
        )

        assert per_act_token == per_channel_quant
        assert self.moe_quant_config is not None
        return rocm_aiter_fused_experts(
            hidden_states=x,
            w1=layer.w13_weight,
            w2=layer.w2_weight,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            activation=activation,
            apply_router_weight_on_input=apply_router_weight_on_input,
            expert_map=expert_map,
            quant_config=self.moe_quant_config,
        )

    # cutlass path
    elif self.use_cutlass:
        assert self.moe_quant_config is not None

        # small-batch fallback on SM100
        if self.is_fp8_w8a8_sm100 and topk_ids.shape[0] <= 8:
            from vllm.model_executor.layers.fused_moe import fused_experts

            assert per_act_token == per_channel_quant
            return fused_experts(
                hidden_states=x,
                w1=layer.w13_weight,
                w2=layer.w2_weight,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                inplace=True,
                activation=activation,
                apply_router_weight_on_input=apply_router_weight_on_input,
                global_num_experts=global_num_experts,
                expert_map=None if self.disable_expert_map else expert_map,
                quant_config=self.moe_quant_config,
            )
        else:
            from vllm.model_executor.layers.fused_moe.cutlass_moe import (
                cutlass_moe_fp8,
            )

            assert per_act_token == per_channel_quant
            assert self.moe_quant_config is not None
            return cutlass_moe_fp8(
                x,
                layer.w13_weight,
                layer.w2_weight,
                topk_weights,
                topk_ids,
                quant_config=self.moe_quant_config,
                activation=activation,
                global_num_experts=global_num_experts,
                expert_map=None if self.disable_expert_map else expert_map,
                ab_strides1=self.ab_strides1_c_strides2,
                ab_strides2=self.ab_strides2,
                c_strides1=self.c_strides1,
                c_strides2=self.ab_strides1_c_strides2,
            )

    else:
        from vllm.model_executor.layers.fused_moe import fused_experts

        assert per_act_token == per_channel_quant
        assert self.moe_quant_config is not None
        return fused_experts(
            hidden_states=x,
            w1=layer.w13_weight,
            w2=layer.w2_weight,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            inplace=True,
            activation=activation,
            apply_router_weight_on_input=apply_router_weight_on_input,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            quant_config=self.moe_quant_config,
        )

create_weights ¶

create_weights(
    layer: Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: dtype,
    **extra_weight_attrs,
)

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def create_weights(
    self,
    layer: torch.nn.Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: torch.dtype,
    **extra_weight_attrs,
):
    layer.intermediate_size_per_partition = intermediate_size_per_partition
    layer.hidden_size = hidden_size
    layer.num_experts = num_experts
    layer.orig_dtype = params_dtype
    layer.weight_block_size = None

    params_dtype = torch.float8_e4m3fn

    if self.block_quant:
        assert self.weight_block_size is not None
        layer.weight_block_size = self.weight_block_size
        tp_size = get_tensor_model_parallel_world_size()
        block_n, block_k = (
            self.weight_block_size[0],
            self.weight_block_size[1],
        )
        # NOTE: To ensure proper alignment of the block-wise quantization
        # scales, the output_size of the weights for both the gate and up
        # layers must be divisible by block_n.
        # Required by column parallel or enabling merged weights
        if intermediate_size_per_partition % block_n != 0:
            raise ValueError(
                f"The output_size of gate's and up's weight = "
                f"{intermediate_size_per_partition} is not divisible by "
                f"weight quantization block_n = {block_n}."
            )
        if tp_size > 1 and intermediate_size_per_partition % block_k != 0:
            # Required by row parallel
            raise ValueError(
                f"The input_size of down's weight = "
                f"{intermediate_size_per_partition} is not divisible by "
                f"weight quantization block_k = {block_k}."
            )

    # WEIGHTS
    w13_weight = torch.nn.Parameter(
        torch.empty(
            num_experts,
            2 * intermediate_size_per_partition,
            hidden_size,
            dtype=params_dtype,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w13_weight", w13_weight)
    set_weight_attrs(w13_weight, extra_weight_attrs)

    w2_weight = torch.nn.Parameter(
        torch.empty(
            num_experts,
            hidden_size,
            intermediate_size_per_partition,
            dtype=params_dtype,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w2_weight", w2_weight)
    set_weight_attrs(w2_weight, extra_weight_attrs)

    # WEIGHT_SCALES
    if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
        # Allocate 2 scales for w1 and w3 respectively.
        # They are combined to a single scale after weight loading.
        w13_weight_scale = torch.nn.Parameter(
            torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
        )
        layer.register_parameter("w13_weight_scale", w13_weight_scale)
        w2_weight_scale = torch.nn.Parameter(
            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
        )
        layer.register_parameter("w2_weight_scale", w2_weight_scale)
        # Add PER-TENSOR quantization for FusedMoE.weight_loader.
        extra_weight_attrs.update(
            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
        )
        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
        set_weight_attrs(w2_weight_scale, extra_weight_attrs)

    elif self.weight_quant.strategy == QuantizationStrategy.CHANNEL:
        w13_weight_scale = torch.nn.Parameter(
            torch.ones(
                num_experts,
                2 * intermediate_size_per_partition,
                1,
                dtype=torch.float32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight_scale", w13_weight_scale)
        w2_weight_scale = torch.nn.Parameter(
            torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight_scale", w2_weight_scale)
        # Add PER-CHANNEL quantization for FusedMoE.weight_loader.
        extra_weight_attrs.update(
            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
        )
        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
        set_weight_attrs(w2_weight_scale, extra_weight_attrs)

    elif self.weight_quant.strategy == QuantizationStrategy.BLOCK:
        w13_weight_scale = torch.nn.Parameter(
            torch.ones(
                num_experts,
                2 * ((intermediate_size_per_partition + block_n - 1) // block_n),
                (hidden_size + block_k - 1) // block_k,
                dtype=torch.float32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight_scale", w13_weight_scale)
        w2_weight_scale = torch.nn.Parameter(
            torch.ones(
                num_experts,
                (hidden_size + block_n - 1) // block_n,
                (intermediate_size_per_partition + block_k - 1) // block_k,
                dtype=torch.float32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight_scale", w2_weight_scale)
        # Add PER-CHANNEL quantization for FusedMoE.weight_loader.
        extra_weight_attrs.update(
            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
        )
        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
        set_weight_attrs(w2_weight_scale, extra_weight_attrs)

    # INPUT_SCALES
    if self.static_input_scales:
        w13_input_scale = torch.nn.Parameter(
            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
        )
        layer.register_parameter("w13_input_scale", w13_input_scale)
        set_weight_attrs(w13_input_scale, extra_weight_attrs)

        w2_input_scale = torch.nn.Parameter(
            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
        )
        layer.register_parameter("w2_input_scale", w2_input_scale)
        set_weight_attrs(w2_input_scale, extra_weight_attrs)
    else:
        layer.w13_input_scale = None
        layer.w2_input_scale = None

get_fused_moe_quant_config ¶

get_fused_moe_quant_config(
    layer: Module,
) -> FusedMoEQuantConfig | None

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def get_fused_moe_quant_config(
    self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
    if self.use_marlin:
        return None

    per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
    per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL

    return fp8_w8a8_moe_quant_config(
        w1_scale=layer.w13_weight_scale,
        w2_scale=layer.w2_weight_scale,
        a1_scale=layer.w13_input_scale,
        a2_scale=layer.w2_input_scale,
        per_act_token_quant=per_act_token,
        per_out_ch_quant=per_channel_quant,
        block_shape=layer.weight_block_size,
    )

maybe_make_prepare_finalize ¶

maybe_make_prepare_finalize(
    routing_tables: tuple[Tensor, Tensor, Tensor]
    | None = None,
) -> FusedMoEPrepareAndFinalize | None

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def maybe_make_prepare_finalize(
    self,
    routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
) -> mk.FusedMoEPrepareAndFinalize | None:
    if self.use_marlin or self.rocm_aiter_moe_enabled:
        return None
    else:
        return super().maybe_make_prepare_finalize(routing_tables)

process_weights_after_loading ¶

process_weights_after_loading(layer: Module) -> None

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    # Fp8 moe kernels require a single activation scale.
    # We take the max of all the scales in case they differ.
    if self.static_input_scales:
        assert self.input_quant.strategy == QuantizationStrategy.TENSOR
        if layer.w13_input_scale is None or layer.w2_input_scale is None:
            raise ValueError(
                "QuantConfig has static quantization, but found "
                "activation scales are None."
            )
        if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
            layer.w2_input_scale
        ):
            logger.warning_once(
                "Found input_scales that are not equal for "
                "fp8 MoE layer. Using the maximum across experts "
                "for each layer."
            )
        layer.w13_input_scale = torch.nn.Parameter(
            layer.w13_input_scale.max(), requires_grad=False
        )
        layer.w2_input_scale = torch.nn.Parameter(
            layer.w2_input_scale.max(), requires_grad=False
        )

    if current_platform.is_fp8_fnuz():
        # Normalize the weights and scales
        w13_weight, w13_weight_scale, w13_input_scale = (
            normalize_e4m3fn_to_e4m3fnuz(
                layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale
            )
        )
        w2_weight, w2_weight_scale, w2_input_scale = normalize_e4m3fn_to_e4m3fnuz(
            layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale
        )
        # Reset the parameter
        layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
        layer.w13_weight_scale = torch.nn.Parameter(
            w13_weight_scale, requires_grad=False
        )
        if w13_input_scale is not None:
            layer.w13_input_scale = torch.nn.Parameter(
                w13_input_scale, requires_grad=False
            )
        layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
        layer.w2_weight_scale = torch.nn.Parameter(
            w2_weight_scale, requires_grad=False
        )
        if w2_input_scale is not None:
            layer.w2_input_scale = torch.nn.Parameter(
                w2_input_scale, requires_grad=False
            )

    # For Per-TENSOR case, Fp8 moe kernel needs single weight scale
    # for w13 per expert. Use max then dequant and requant each expert.
    if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
        assert layer.w13_weight_scale is not None
        shard_size = layer.intermediate_size_per_partition
        max_w13_scales = layer.w13_weight_scale.max(dim=1).values
        for expert_id in range(layer.local_num_experts):
            start = 0
            for shard_id in range(2):
                dq_weight = per_tensor_dequantize(
                    layer.w13_weight[expert_id][start : start + shard_size, :],
                    layer.w13_weight_scale[expert_id][shard_id],
                )
                layer.w13_weight[expert_id][start : start + shard_size, :], _ = (
                    ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
                )
                start += shard_size
        layer.w13_weight_scale = torch.nn.Parameter(
            max_w13_scales, requires_grad=False
        )

    # Property to determine if AITER is used
    if self.rocm_aiter_moe_enabled:
        # reshaping weights is required for aiter moe kernel.
        shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
            layer.w13_weight.data, layer.w2_weight.data
        )

        layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False)
        layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)

    elif self.use_marlin:
        prepare_moe_fp8_layer_for_marlin(layer, False)
        # Activations not quantized for marlin.
        del layer.w13_input_scale
        del layer.w2_input_scale

    if self.use_cutlass:
        assert self.weight_quant.strategy != QuantizationStrategy.BLOCK
        device = layer.w13_weight.device
        # ab_strides1 and c_strides2 are the same
        self.ab_strides1_c_strides2 = torch.full(
            (layer.local_num_experts,),
            layer.hidden_size,
            device=device,
            dtype=torch.int64,
        )
        self.ab_strides2 = torch.full(
            (layer.local_num_experts,),
            layer.intermediate_size_per_partition,
            device=device,
            dtype=torch.int64,
        )
        self.c_strides1 = torch.full(
            (layer.local_num_experts,),
            2 * layer.intermediate_size_per_partition,
            device=device,
            dtype=torch.int64,
        )

    if is_deep_gemm_e8m0_used() and self.block_quant:
        assert layer.weight_block_size is not None
        # Re-quantise the expert weights so their scales are UE8M0.
        block_sz = tuple(layer.weight_block_size)
        requant_weight_ue8m0_inplace(
            layer.w13_weight.data,
            layer.w13_weight_scale.data,
            block_sz,
        )
        requant_weight_ue8m0_inplace(
            layer.w2_weight.data,
            layer.w2_weight_scale.data,
            block_sz,
        )

        # Ensure column-major TMA alignment expected by DeepGEMM.
        if expert_weight_is_col_major(layer.w13_weight_scale):
            layer.w13_weight_scale = get_col_major_tma_aligned_tensor(
                layer.w13_weight_scale
            )
        if expert_weight_is_col_major(layer.w2_weight_scale):
            layer.w2_weight_scale = get_col_major_tma_aligned_tensor(
                layer.w2_weight_scale
            )

select_gemm_impl ¶

select_gemm_impl(
    prepare_finalize: FusedMoEPrepareAndFinalize,
    layer: Module,
) -> FusedMoEPermuteExpertsUnpermute

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def select_gemm_impl(
    self,
    prepare_finalize: mk.FusedMoEPrepareAndFinalize,
    layer: torch.nn.Module,
) -> FusedMoEPermuteExpertsUnpermute:
    # cutlass path
    assert self.moe_quant_config is not None
    if self.use_cutlass:
        from vllm.model_executor.layers.fused_moe import (
            CutlassBatchedExpertsFp8,
            CutlassExpertsFp8,
        )

        experts: FusedMoEPermuteExpertsUnpermute

        num_dispatchers = prepare_finalize.num_dispatchers()

        if (
            prepare_finalize.activation_format
            == FusedMoEActivationFormat.BatchedExperts
        ):
            logger.debug("CutlassBatchedExpertsFp8(%s)", self.__class__.__name__)
            experts = CutlassBatchedExpertsFp8(
                self.moe.num_local_experts,
                num_dispatchers,
                self.moe.in_dtype,
                ab_strides1=self.ab_strides1_c_strides2,
                ab_strides2=self.ab_strides2,
                c_strides1=self.c_strides1,
                c_strides2=self.ab_strides1_c_strides2,
                quant_config=self.moe_quant_config,
            )
        else:
            logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__)
            experts = CutlassExpertsFp8(
                self.moe.in_dtype,
                ab_strides1=self.ab_strides1_c_strides2,
                ab_strides2=self.ab_strides2,
                c_strides1=self.c_strides1,
                c_strides2=self.ab_strides1_c_strides2,
                quant_config=self.moe_quant_config,
            )

        self.disable_expert_map = (
            num_dispatchers > 1 or not experts.supports_expert_map()
        )

        return experts

    # triton path
    from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
        BatchedTritonOrDeepGemmExperts,
    )
    from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
        TritonOrDeepGemmExperts,
    )

    assert not self.rocm_aiter_moe_enabled and not self.use_marlin

    if (
        prepare_finalize.activation_format
        == FusedMoEActivationFormat.BatchedExperts
    ):
        max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
        assert max_num_tokens_per_rank is not None

        logger.debug("BatchedTritonExperts(%s)", self.__class__.__name__)
        return BatchedTritonOrDeepGemmExperts(
            max_num_tokens=max_num_tokens_per_rank,
            num_dispatchers=prepare_finalize.num_dispatchers(),
            quant_config=self.moe_quant_config,
            allow_deep_gemm=(
                envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
            ),
        )
    else:
        logger.debug("TritonOrDeepGemmExperts(%s)", self.__class__.__name__)
        return TritonOrDeepGemmExperts(
            self.moe_quant_config,
            allow_deep_gemm=(
                envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
            ),
        )

CompressedTensorsW8A8Int8MoEMethod ¶

Bases: CompressedTensorsMoEMethod

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
    def __init__(
        self,
        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
        moe: FusedMoEConfig,
    ):
        super().__init__(moe)
        self.quant_config = quant_config
        self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights")
        self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
            "input_activations"
        )

        per_channel = (
            self.weight_quant.strategy == QuantizationStrategy.CHANNEL
            and self.input_quant.strategy == QuantizationStrategy.TOKEN
        )
        if not per_channel:
            raise ValueError(
                "For INT8 Fused MoE layers, we require channelwise, "
                "dynamic per token quantization. Found "
                f"{self.weight_quant}, {self.input_quant}"
            )

        self.static_input_scales = not self.input_quant.dynamic
        if self.static_input_scales:
            raise ValueError(
                "For INT8 Fused MoE layers, we require channelwise, "
                "dynamic per token quantization. Found static input scales."
            )

    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        params_dtype = torch.int8

        # WEIGHTS
        w13_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                2 * intermediate_size_per_partition,
                hidden_size,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)

        w2_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                intermediate_size_per_partition,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)

        # WEIGHT_SCALES
        assert self.weight_quant.strategy == QuantizationStrategy.CHANNEL
        w13_weight_scale = torch.nn.Parameter(
            torch.ones(
                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight_scale", w13_weight_scale)
        w2_weight_scale = torch.nn.Parameter(
            torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight_scale", w2_weight_scale)
        # Add PER-CHANNEL quantization for FusedMoE.weight_loader.
        extra_weight_attrs.update(
            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
        )
        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
        set_weight_attrs(w2_weight_scale, extra_weight_attrs)

        # INPUT_SCALES
        assert not self.static_input_scales
        layer.w13_input_scale = None
        layer.w2_input_scale = None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        pass

    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        return int8_w8a8_moe_quant_config(
            w1_scale=layer.w13_weight_scale,
            w2_scale=layer.w2_weight_scale,
            a1_scale=layer.w13_input_scale,
            a2_scale=layer.w2_input_scale,
            per_act_token_quant=True,
        )

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: int | None = None,
        num_expert_group: int | None = None,
        global_num_experts: int = -1,
        expert_map: torch.Tensor | None = None,
        custom_routing_function: Callable | None = None,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        e_score_correction_bias: torch.Tensor | None = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        enable_eplb: bool = False,
        expert_load_view: torch.Tensor | None = None,
        logical_to_physical_map: torch.Tensor | None = None,
        logical_replica_count: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        if enable_eplb:
            raise NotImplementedError(
                "EPLB not supported for `CompressedTensorsW8A8Int8MoEMethod` yet."
            )

        from vllm.model_executor.layers.fused_moe import fused_experts

        topk_weights, topk_ids, _ = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
            use_grouped_topk=use_grouped_topk,
            top_k=top_k,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            routed_scaling_factor=routed_scaling_factor,
            e_score_correction_bias=e_score_correction_bias,
            indices_type=self.topk_indices_dtype,
        )

        return fused_experts(
            hidden_states=x,
            w1=layer.w13_weight,
            w2=layer.w2_weight,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            inplace=True,
            activation=activation,
            apply_router_weight_on_input=apply_router_weight_on_input,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            quant_config=self.moe_quant_config,
        )

input_quant `instance-attribute` ¶

input_quant = get('input_activations')

quant_config `instance-attribute` ¶

quant_config = quant_config

static_input_scales `instance-attribute` ¶

static_input_scales = not dynamic

weight_quant `instance-attribute` ¶

weight_quant = get('weights')

init ¶

__init__(
    quant_config: CompressedTensorsConfig,
    moe: FusedMoEConfig,
)

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def __init__(
    self,
    quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
    moe: FusedMoEConfig,
):
    super().__init__(moe)
    self.quant_config = quant_config
    self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights")
    self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
        "input_activations"
    )

    per_channel = (
        self.weight_quant.strategy == QuantizationStrategy.CHANNEL
        and self.input_quant.strategy == QuantizationStrategy.TOKEN
    )
    if not per_channel:
        raise ValueError(
            "For INT8 Fused MoE layers, we require channelwise, "
            "dynamic per token quantization. Found "
            f"{self.weight_quant}, {self.input_quant}"
        )

    self.static_input_scales = not self.input_quant.dynamic
    if self.static_input_scales:
        raise ValueError(
            "For INT8 Fused MoE layers, we require channelwise, "
            "dynamic per token quantization. Found static input scales."
        )

apply ¶

apply(
    layer: Module,
    x: Tensor,
    router_logits: Tensor,
    top_k: int,
    renormalize: bool,
    use_grouped_topk: bool = False,
    topk_group: int | None = None,
    num_expert_group: int | None = None,
    global_num_experts: int = -1,
    expert_map: Tensor | None = None,
    custom_routing_function: Callable | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    e_score_correction_bias: Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    enable_eplb: bool = False,
    expert_load_view: Tensor | None = None,
    logical_to_physical_map: Tensor | None = None,
    logical_replica_count: Tensor | None = None,
) -> Tensor | tuple[Tensor, Tensor]

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def apply(
    self,
    layer: torch.nn.Module,
    x: torch.Tensor,
    router_logits: torch.Tensor,
    top_k: int,
    renormalize: bool,
    use_grouped_topk: bool = False,
    topk_group: int | None = None,
    num_expert_group: int | None = None,
    global_num_experts: int = -1,
    expert_map: torch.Tensor | None = None,
    custom_routing_function: Callable | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    e_score_correction_bias: torch.Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    enable_eplb: bool = False,
    expert_load_view: torch.Tensor | None = None,
    logical_to_physical_map: torch.Tensor | None = None,
    logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    if enable_eplb:
        raise NotImplementedError(
            "EPLB not supported for `CompressedTensorsW8A8Int8MoEMethod` yet."
        )

    from vllm.model_executor.layers.fused_moe import fused_experts

    topk_weights, topk_ids, _ = FusedMoE.select_experts(
        hidden_states=x,
        router_logits=router_logits,
        use_grouped_topk=use_grouped_topk,
        top_k=top_k,
        renormalize=renormalize,
        topk_group=topk_group,
        num_expert_group=num_expert_group,
        custom_routing_function=custom_routing_function,
        scoring_func=scoring_func,
        routed_scaling_factor=routed_scaling_factor,
        e_score_correction_bias=e_score_correction_bias,
        indices_type=self.topk_indices_dtype,
    )

    return fused_experts(
        hidden_states=x,
        w1=layer.w13_weight,
        w2=layer.w2_weight,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        inplace=True,
        activation=activation,
        apply_router_weight_on_input=apply_router_weight_on_input,
        global_num_experts=global_num_experts,
        expert_map=expert_map,
        quant_config=self.moe_quant_config,
    )

create_weights ¶

create_weights(
    layer: Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: dtype,
    **extra_weight_attrs,
)

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def create_weights(
    self,
    layer: torch.nn.Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: torch.dtype,
    **extra_weight_attrs,
):
    params_dtype = torch.int8

    # WEIGHTS
    w13_weight = torch.nn.Parameter(
        torch.empty(
            num_experts,
            2 * intermediate_size_per_partition,
            hidden_size,
            dtype=params_dtype,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w13_weight", w13_weight)
    set_weight_attrs(w13_weight, extra_weight_attrs)

    w2_weight = torch.nn.Parameter(
        torch.empty(
            num_experts,
            hidden_size,
            intermediate_size_per_partition,
            dtype=params_dtype,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w2_weight", w2_weight)
    set_weight_attrs(w2_weight, extra_weight_attrs)

    # WEIGHT_SCALES
    assert self.weight_quant.strategy == QuantizationStrategy.CHANNEL
    w13_weight_scale = torch.nn.Parameter(
        torch.ones(
            num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
        ),
        requires_grad=False,
    )
    layer.register_parameter("w13_weight_scale", w13_weight_scale)
    w2_weight_scale = torch.nn.Parameter(
        torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
        requires_grad=False,
    )
    layer.register_parameter("w2_weight_scale", w2_weight_scale)
    # Add PER-CHANNEL quantization for FusedMoE.weight_loader.
    extra_weight_attrs.update(
        {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
    )
    set_weight_attrs(w13_weight_scale, extra_weight_attrs)
    set_weight_attrs(w2_weight_scale, extra_weight_attrs)

    # INPUT_SCALES
    assert not self.static_input_scales
    layer.w13_input_scale = None
    layer.w2_input_scale = None

get_fused_moe_quant_config ¶

get_fused_moe_quant_config(
    layer: Module,
) -> FusedMoEQuantConfig | None

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def get_fused_moe_quant_config(
    self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
    return int8_w8a8_moe_quant_config(
        w1_scale=layer.w13_weight_scale,
        w2_scale=layer.w2_weight_scale,
        a1_scale=layer.w13_input_scale,
        a2_scale=layer.w2_input_scale,
        per_act_token_quant=True,
    )

process_weights_after_loading ¶

process_weights_after_loading(layer: Module) -> None

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    pass

CompressedTensorsWNA16MarlinMoEMethod ¶

Bases: CompressedTensorsMoEMethod

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
    def __init__(
        self,
        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
        moe: FusedMoEConfig,
    ):
        super().__init__(moe)
        self.quant_config = quant_config
        # TODO: @dsikka: refactor this to use schemes as other kernels
        # are supported + check if the layer is being ignored.
        config = self.quant_config.target_scheme_map["Linear"].get("weights")
        self.num_bits = config.num_bits
        self.packed_factor = 32 // config.num_bits
        self.strategy = config.strategy
        self.group_size = config.group_size
        self.actorder = config.actorder
        assert config.symmetric, "Only symmetric quantization is supported for MoE"

        if not (
            self.quant_config.quant_format == CompressionFormat.pack_quantized.value
            and self.num_bits in WNA16_SUPPORTED_BITS
        ):
            raise ValueError(
                "For Fused MoE layers, only ",
                f"{CompressionFormat.pack_quantized.value} ",
                "is supported for the following bits: ",
                f"{WNA16_SUPPORTED_BITS}",
            )
        self.quant_type = WNA16_SUPPORTED_TYPES_MAP[self.num_bits]
        self.use_marlin = True

    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        intermediate_size_full = extra_weight_attrs.pop("intermediate_size_full")

        # Will transpose the loaded weight along the
        # intermediate and hidden dim sizes. Will
        # shard for TP along the transposed dims
        extra_weight_attrs.update(
            {"is_transposed": True, "quant_method": self.strategy}
        )
        w13_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size // self.packed_factor,
                2 * intermediate_size_per_partition,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight_packed", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)

        w2_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                intermediate_size_per_partition // self.packed_factor,
                hidden_size,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight_packed", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)

        # In the case where we have actorder/g_idx,
        # we do not partition the w2 scales
        load_full_w2 = self.actorder and self.group_size != -1
        w2_scales_size = (
            intermediate_size_full if load_full_w2 else intermediate_size_per_partition
        )

        self.is_k_full = (not self.actorder) or (
            intermediate_size_per_partition == intermediate_size_full
        )

        if self.strategy == "channel":
            num_groups_w2 = num_groups_w13 = 1
            self.group_size = -1
        else:
            num_groups_w2 = w2_scales_size // self.group_size
            num_groups_w13 = hidden_size // self.group_size

        w13_scale = torch.nn.Parameter(
            torch.ones(
                num_experts,
                num_groups_w13,
                2 * intermediate_size_per_partition,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight_scale", w13_scale)
        set_weight_attrs(w13_scale, extra_weight_attrs)

        w2_scale = torch.nn.Parameter(
            torch.ones(num_experts, num_groups_w2, hidden_size, dtype=params_dtype),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight_scale", w2_scale)
        set_weight_attrs(w2_scale, extra_weight_attrs)
        set_weight_attrs(w2_scale, {"load_full_w2": load_full_w2})

        w2_weight_shape = torch.nn.Parameter(
            torch.empty(num_experts, 2), requires_grad=False
        )
        layer.register_parameter("w2_weight_shape", w2_weight_shape)
        set_weight_attrs(w2_weight_shape, extra_weight_attrs)
        w13_weight_shape = torch.nn.Parameter(
            torch.empty(num_experts, 2), requires_grad=False
        )

        layer.register_parameter("w13_weight_shape", w13_weight_shape)
        set_weight_attrs(w13_weight_shape, extra_weight_attrs)

        w13_g_idx = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight_g_idx", w13_g_idx)
        set_weight_attrs(w13_g_idx, extra_weight_attrs)

        w2_g_idx = torch.nn.Parameter(
            torch.empty(
                num_experts,
                intermediate_size_per_partition,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight_g_idx", w2_g_idx)
        set_weight_attrs(w2_g_idx, extra_weight_attrs)

        w13_g_idx_sort_indices = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices)
        set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)

        w2_g_idx_sort_indices = torch.nn.Parameter(
            torch.empty(
                num_experts,
                intermediate_size_per_partition,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices)
        set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)

        layer.a13_scale = None
        layer.a2_scale = None
        layer.marlin_state = GPTQMarlinState.REPACK

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        num_experts = layer.w13_weight_g_idx.shape[0]
        device = layer.w13_weight_g_idx.device

        # when running models with grouped act order,
        # resort to g_idx values provided in checkpoint
        if self.actorder == "group":
            w13_g_idx_sort_indices = torch.empty_like(layer.w13_weight_g_idx)
            w2_g_idx_sort_indices = torch.empty_like(layer.w2_weight_g_idx)
            w13_sorted_g_idx = torch.empty_like(layer.w13_weight_g_idx)
            w2_sorted_g_idx = torch.empty_like(layer.w2_weight_g_idx)

            for e in range(num_experts):
                w13_g_idx_sort_indices[e] = torch.argsort(layer.w13_weight_g_idx[e]).to(
                    torch.int32
                )
                w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_weight_g_idx[e]).to(
                    torch.int32
                )
                w13_sorted_g_idx[e] = layer.w13_weight_g_idx[e][
                    w13_g_idx_sort_indices[e]
                ]
                w2_sorted_g_idx[e] = layer.w2_weight_g_idx[e][w2_g_idx_sort_indices[e]]

            replace_parameter(layer, "w13_weight_g_idx", w13_sorted_g_idx)
            replace_parameter(layer, "w2_weight_g_idx", w2_sorted_g_idx)
            replace_parameter(layer, "w13_g_idx_sort_indices", w13_g_idx_sort_indices)
            replace_parameter(layer, "w2_g_idx_sort_indices", w2_g_idx_sort_indices)

        else:
            layer.w13_weight_g_idx = torch.nn.Parameter(
                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
                requires_grad=False,
            )
            layer.w2_weight_g_idx = torch.nn.Parameter(
                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
                requires_grad=False,
            )
            layer.w13_g_idx_sort_indices = torch.nn.Parameter(
                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
                requires_grad=False,
            )
            layer.w2_g_idx_sort_indices = torch.nn.Parameter(
                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
                requires_grad=False,
            )

        marlin_w13_qweight = ops.gptq_marlin_moe_repack(
            layer.w13_weight_packed,
            layer.w13_g_idx_sort_indices,
            layer.w13_weight_packed.shape[1] * self.packed_factor,
            layer.w13_weight_packed.shape[2],
            self.num_bits,
        )
        replace_parameter(layer, "w13_weight_packed", marlin_w13_qweight)
        marlin_w2_qweight = ops.gptq_marlin_moe_repack(
            layer.w2_weight_packed,
            layer.w2_g_idx_sort_indices,
            layer.w2_weight_packed.shape[1] * self.packed_factor,
            layer.w2_weight_packed.shape[2],
            self.num_bits,
        )
        replace_parameter(layer, "w2_weight_packed", marlin_w2_qweight)
        # Repack scales
        marlin_w13_scales = marlin_moe_permute_scales(
            s=layer.w13_weight_scale,
            size_k=layer.w13_weight_packed.shape[2],
            size_n=layer.w13_weight_scale.shape[2],
            group_size=self.group_size,
        )
        replace_parameter(layer, "w13_weight_scale", marlin_w13_scales)
        marlin_w2_scales = marlin_moe_permute_scales(
            s=layer.w2_weight_scale,
            size_k=layer.w2_weight_scale.shape[1]
            * (self.group_size if self.group_size != -1 else self.packed_factor),
            size_n=layer.w2_weight_scale.shape[2],
            group_size=self.group_size,
        )
        replace_parameter(layer, "w2_weight_scale", marlin_w2_scales)

        layer.workspace = marlin_make_workspace_new(device, 4)

    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        if self.num_bits != 4:
            return None
        return int4_w4a16_moe_quant_config(
            w1_scale=layer.w13_weight_scale,
            w2_scale=layer.w2_weight_scale,
            w1_zp=None,
            w2_zp=None,
            block_shape=[0, self.group_size],
        )

    def select_gemm_impl(
        self,
        prepare_finalize: mk.FusedMoEPrepareAndFinalize,
        layer: torch.nn.Module,
    ) -> mk.FusedMoEPermuteExpertsUnpermute:
        assert self.num_bits == 4, "only supporting w4"
        layer.w13_weight = layer.w13_weight_packed
        layer.w2_weight = layer.w2_weight_packed
        assert all([w is not None for w in [layer.w13_weight, layer.w2_weight]])
        assert self.moe_quant_config is not None
        if (
            prepare_finalize.activation_format
            == mk.FusedMoEActivationFormat.BatchedExperts
        ):
            max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
            assert max_num_tokens_per_rank is not None
            return BatchedMarlinExperts(
                max_num_tokens=max_num_tokens_per_rank,
                num_dispatchers=prepare_finalize.num_dispatchers(),
                quant_config=self.moe_quant_config,
                w13_g_idx=layer.w13_weight_g_idx,
                w2_g_idx=layer.w2_weight_g_idx,
                w13_g_idx_sort_indices=layer.w13_g_idx_sort_indices,
                w2_g_idx_sort_indices=layer.w2_g_idx_sort_indices,
                is_k_full=self.is_k_full,
            )
        else:
            return MarlinExperts(
                quant_config=self.moe_quant_config,
                w13_g_idx=layer.w13_weight_g_idx,
                w2_g_idx=layer.w2_weight_g_idx,
                w13_g_idx_sort_indices=layer.w13_g_idx_sort_indices,
                w2_g_idx_sort_indices=layer.w2_g_idx_sort_indices,
                is_k_full=self.is_k_full,
            )

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: int | None = None,
        num_expert_group: int | None = None,
        global_num_experts: int = -1,
        expert_map: torch.Tensor | None = None,
        custom_routing_function: Callable | None = None,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        e_score_correction_bias: torch.Tensor | None = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        enable_eplb: bool = False,
        expert_load_view: torch.Tensor | None = None,
        logical_to_physical_map: torch.Tensor | None = None,
        logical_replica_count: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        if enable_eplb:
            raise NotImplementedError(
                "EPLB not supported for `CompressedTensorsWNA16MarlinMoEMethod` yet."
            )

        assert activation == "silu", f"{activation} not supported for Marlin MoE."

        topk_weights, topk_ids, _ = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
            use_grouped_topk=use_grouped_topk,
            top_k=top_k,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            routed_scaling_factor=routed_scaling_factor,
            e_score_correction_bias=e_score_correction_bias,
            indices_type=self.topk_indices_dtype,
        )

        return fused_marlin_moe(
            x,
            layer.w13_weight_packed,
            layer.w2_weight_packed,
            None,
            None,
            layer.w13_weight_scale,
            layer.w2_weight_scale,
            router_logits,
            topk_weights,
            topk_ids,
            quant_type_id=self.quant_type.id,
            apply_router_weight_on_input=apply_router_weight_on_input,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            g_idx1=layer.w13_weight_g_idx,
            g_idx2=layer.w2_weight_g_idx,
            sort_indices1=layer.w13_g_idx_sort_indices,
            sort_indices2=layer.w2_g_idx_sort_indices,
            workspace=layer.workspace,
            is_k_full=self.is_k_full,
        )

actorder `instance-attribute` ¶

actorder = actorder

group_size `instance-attribute` ¶

group_size = group_size

num_bits `instance-attribute` ¶

num_bits = num_bits

packed_factor `instance-attribute` ¶

packed_factor = 32 // num_bits

quant_config `instance-attribute` ¶

quant_config = quant_config

quant_type `instance-attribute` ¶

quant_type = WNA16_SUPPORTED_TYPES_MAP[num_bits]

strategy `instance-attribute` ¶

strategy = strategy

use_marlin `instance-attribute` ¶

use_marlin = True

init ¶

__init__(
    quant_config: CompressedTensorsConfig,
    moe: FusedMoEConfig,
)

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def __init__(
    self,
    quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
    moe: FusedMoEConfig,
):
    super().__init__(moe)
    self.quant_config = quant_config
    # TODO: @dsikka: refactor this to use schemes as other kernels
    # are supported + check if the layer is being ignored.
    config = self.quant_config.target_scheme_map["Linear"].get("weights")
    self.num_bits = config.num_bits
    self.packed_factor = 32 // config.num_bits
    self.strategy = config.strategy
    self.group_size = config.group_size
    self.actorder = config.actorder
    assert config.symmetric, "Only symmetric quantization is supported for MoE"

    if not (
        self.quant_config.quant_format == CompressionFormat.pack_quantized.value
        and self.num_bits in WNA16_SUPPORTED_BITS
    ):
        raise ValueError(
            "For Fused MoE layers, only ",
            f"{CompressionFormat.pack_quantized.value} ",
            "is supported for the following bits: ",
            f"{WNA16_SUPPORTED_BITS}",
        )
    self.quant_type = WNA16_SUPPORTED_TYPES_MAP[self.num_bits]
    self.use_marlin = True

apply ¶

apply(
    layer: Module,
    x: Tensor,
    router_logits: Tensor,
    top_k: int,
    renormalize: bool,
    use_grouped_topk: bool = False,
    topk_group: int | None = None,
    num_expert_group: int | None = None,
    global_num_experts: int = -1,
    expert_map: Tensor | None = None,
    custom_routing_function: Callable | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    e_score_correction_bias: Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    enable_eplb: bool = False,
    expert_load_view: Tensor | None = None,
    logical_to_physical_map: Tensor | None = None,
    logical_replica_count: Tensor | None = None,
) -> Tensor | tuple[Tensor, Tensor]

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def apply(
    self,
    layer: torch.nn.Module,
    x: torch.Tensor,
    router_logits: torch.Tensor,
    top_k: int,
    renormalize: bool,
    use_grouped_topk: bool = False,
    topk_group: int | None = None,
    num_expert_group: int | None = None,
    global_num_experts: int = -1,
    expert_map: torch.Tensor | None = None,
    custom_routing_function: Callable | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    e_score_correction_bias: torch.Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    enable_eplb: bool = False,
    expert_load_view: torch.Tensor | None = None,
    logical_to_physical_map: torch.Tensor | None = None,
    logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    if enable_eplb:
        raise NotImplementedError(
            "EPLB not supported for `CompressedTensorsWNA16MarlinMoEMethod` yet."
        )

    assert activation == "silu", f"{activation} not supported for Marlin MoE."

    topk_weights, topk_ids, _ = FusedMoE.select_experts(
        hidden_states=x,
        router_logits=router_logits,
        use_grouped_topk=use_grouped_topk,
        top_k=top_k,
        renormalize=renormalize,
        topk_group=topk_group,
        num_expert_group=num_expert_group,
        custom_routing_function=custom_routing_function,
        scoring_func=scoring_func,
        routed_scaling_factor=routed_scaling_factor,
        e_score_correction_bias=e_score_correction_bias,
        indices_type=self.topk_indices_dtype,
    )

    return fused_marlin_moe(
        x,
        layer.w13_weight_packed,
        layer.w2_weight_packed,
        None,
        None,
        layer.w13_weight_scale,
        layer.w2_weight_scale,
        router_logits,
        topk_weights,
        topk_ids,
        quant_type_id=self.quant_type.id,
        apply_router_weight_on_input=apply_router_weight_on_input,
        global_num_experts=global_num_experts,
        expert_map=expert_map,
        g_idx1=layer.w13_weight_g_idx,
        g_idx2=layer.w2_weight_g_idx,
        sort_indices1=layer.w13_g_idx_sort_indices,
        sort_indices2=layer.w2_g_idx_sort_indices,
        workspace=layer.workspace,
        is_k_full=self.is_k_full,
    )

create_weights ¶

create_weights(
    layer: Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: dtype,
    **extra_weight_attrs,
)

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def create_weights(
    self,
    layer: torch.nn.Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: torch.dtype,
    **extra_weight_attrs,
):
    intermediate_size_full = extra_weight_attrs.pop("intermediate_size_full")

    # Will transpose the loaded weight along the
    # intermediate and hidden dim sizes. Will
    # shard for TP along the transposed dims
    extra_weight_attrs.update(
        {"is_transposed": True, "quant_method": self.strategy}
    )
    w13_weight = torch.nn.Parameter(
        torch.empty(
            num_experts,
            hidden_size // self.packed_factor,
            2 * intermediate_size_per_partition,
            dtype=torch.int32,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w13_weight_packed", w13_weight)
    set_weight_attrs(w13_weight, extra_weight_attrs)

    w2_weight = torch.nn.Parameter(
        torch.empty(
            num_experts,
            intermediate_size_per_partition // self.packed_factor,
            hidden_size,
            dtype=torch.int32,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w2_weight_packed", w2_weight)
    set_weight_attrs(w2_weight, extra_weight_attrs)

    # In the case where we have actorder/g_idx,
    # we do not partition the w2 scales
    load_full_w2 = self.actorder and self.group_size != -1
    w2_scales_size = (
        intermediate_size_full if load_full_w2 else intermediate_size_per_partition
    )

    self.is_k_full = (not self.actorder) or (
        intermediate_size_per_partition == intermediate_size_full
    )

    if self.strategy == "channel":
        num_groups_w2 = num_groups_w13 = 1
        self.group_size = -1
    else:
        num_groups_w2 = w2_scales_size // self.group_size
        num_groups_w13 = hidden_size // self.group_size

    w13_scale = torch.nn.Parameter(
        torch.ones(
            num_experts,
            num_groups_w13,
            2 * intermediate_size_per_partition,
            dtype=params_dtype,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w13_weight_scale", w13_scale)
    set_weight_attrs(w13_scale, extra_weight_attrs)

    w2_scale = torch.nn.Parameter(
        torch.ones(num_experts, num_groups_w2, hidden_size, dtype=params_dtype),
        requires_grad=False,
    )
    layer.register_parameter("w2_weight_scale", w2_scale)
    set_weight_attrs(w2_scale, extra_weight_attrs)
    set_weight_attrs(w2_scale, {"load_full_w2": load_full_w2})

    w2_weight_shape = torch.nn.Parameter(
        torch.empty(num_experts, 2), requires_grad=False
    )
    layer.register_parameter("w2_weight_shape", w2_weight_shape)
    set_weight_attrs(w2_weight_shape, extra_weight_attrs)
    w13_weight_shape = torch.nn.Parameter(
        torch.empty(num_experts, 2), requires_grad=False
    )

    layer.register_parameter("w13_weight_shape", w13_weight_shape)
    set_weight_attrs(w13_weight_shape, extra_weight_attrs)

    w13_g_idx = torch.nn.Parameter(
        torch.empty(
            num_experts,
            hidden_size,
            dtype=torch.int32,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w13_weight_g_idx", w13_g_idx)
    set_weight_attrs(w13_g_idx, extra_weight_attrs)

    w2_g_idx = torch.nn.Parameter(
        torch.empty(
            num_experts,
            intermediate_size_per_partition,
            dtype=torch.int32,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w2_weight_g_idx", w2_g_idx)
    set_weight_attrs(w2_g_idx, extra_weight_attrs)

    w13_g_idx_sort_indices = torch.nn.Parameter(
        torch.empty(
            num_experts,
            hidden_size,
            dtype=torch.int32,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices)
    set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)

    w2_g_idx_sort_indices = torch.nn.Parameter(
        torch.empty(
            num_experts,
            intermediate_size_per_partition,
            dtype=torch.int32,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices)
    set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)

    layer.a13_scale = None
    layer.a2_scale = None
    layer.marlin_state = GPTQMarlinState.REPACK

get_fused_moe_quant_config ¶

get_fused_moe_quant_config(
    layer: Module,
) -> FusedMoEQuantConfig | None

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def get_fused_moe_quant_config(
    self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
    if self.num_bits != 4:
        return None
    return int4_w4a16_moe_quant_config(
        w1_scale=layer.w13_weight_scale,
        w2_scale=layer.w2_weight_scale,
        w1_zp=None,
        w2_zp=None,
        block_shape=[0, self.group_size],
    )

process_weights_after_loading ¶

process_weights_after_loading(layer: Module) -> None

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    num_experts = layer.w13_weight_g_idx.shape[0]
    device = layer.w13_weight_g_idx.device

    # when running models with grouped act order,
    # resort to g_idx values provided in checkpoint
    if self.actorder == "group":
        w13_g_idx_sort_indices = torch.empty_like(layer.w13_weight_g_idx)
        w2_g_idx_sort_indices = torch.empty_like(layer.w2_weight_g_idx)
        w13_sorted_g_idx = torch.empty_like(layer.w13_weight_g_idx)
        w2_sorted_g_idx = torch.empty_like(layer.w2_weight_g_idx)

        for e in range(num_experts):
            w13_g_idx_sort_indices[e] = torch.argsort(layer.w13_weight_g_idx[e]).to(
                torch.int32
            )
            w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_weight_g_idx[e]).to(
                torch.int32
            )
            w13_sorted_g_idx[e] = layer.w13_weight_g_idx[e][
                w13_g_idx_sort_indices[e]
            ]
            w2_sorted_g_idx[e] = layer.w2_weight_g_idx[e][w2_g_idx_sort_indices[e]]

        replace_parameter(layer, "w13_weight_g_idx", w13_sorted_g_idx)
        replace_parameter(layer, "w2_weight_g_idx", w2_sorted_g_idx)
        replace_parameter(layer, "w13_g_idx_sort_indices", w13_g_idx_sort_indices)
        replace_parameter(layer, "w2_g_idx_sort_indices", w2_g_idx_sort_indices)

    else:
        layer.w13_weight_g_idx = torch.nn.Parameter(
            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
            requires_grad=False,
        )
        layer.w2_weight_g_idx = torch.nn.Parameter(
            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
            requires_grad=False,
        )
        layer.w13_g_idx_sort_indices = torch.nn.Parameter(
            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
            requires_grad=False,
        )
        layer.w2_g_idx_sort_indices = torch.nn.Parameter(
            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
            requires_grad=False,
        )

    marlin_w13_qweight = ops.gptq_marlin_moe_repack(
        layer.w13_weight_packed,
        layer.w13_g_idx_sort_indices,
        layer.w13_weight_packed.shape[1] * self.packed_factor,
        layer.w13_weight_packed.shape[2],
        self.num_bits,
    )
    replace_parameter(layer, "w13_weight_packed", marlin_w13_qweight)
    marlin_w2_qweight = ops.gptq_marlin_moe_repack(
        layer.w2_weight_packed,
        layer.w2_g_idx_sort_indices,
        layer.w2_weight_packed.shape[1] * self.packed_factor,
        layer.w2_weight_packed.shape[2],
        self.num_bits,
    )
    replace_parameter(layer, "w2_weight_packed", marlin_w2_qweight)
    # Repack scales
    marlin_w13_scales = marlin_moe_permute_scales(
        s=layer.w13_weight_scale,
        size_k=layer.w13_weight_packed.shape[2],
        size_n=layer.w13_weight_scale.shape[2],
        group_size=self.group_size,
    )
    replace_parameter(layer, "w13_weight_scale", marlin_w13_scales)
    marlin_w2_scales = marlin_moe_permute_scales(
        s=layer.w2_weight_scale,
        size_k=layer.w2_weight_scale.shape[1]
        * (self.group_size if self.group_size != -1 else self.packed_factor),
        size_n=layer.w2_weight_scale.shape[2],
        group_size=self.group_size,
    )
    replace_parameter(layer, "w2_weight_scale", marlin_w2_scales)

    layer.workspace = marlin_make_workspace_new(device, 4)

select_gemm_impl ¶

select_gemm_impl(
    prepare_finalize: FusedMoEPrepareAndFinalize,
    layer: Module,
) -> FusedMoEPermuteExpertsUnpermute

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def select_gemm_impl(
    self,
    prepare_finalize: mk.FusedMoEPrepareAndFinalize,
    layer: torch.nn.Module,
) -> mk.FusedMoEPermuteExpertsUnpermute:
    assert self.num_bits == 4, "only supporting w4"
    layer.w13_weight = layer.w13_weight_packed
    layer.w2_weight = layer.w2_weight_packed
    assert all([w is not None for w in [layer.w13_weight, layer.w2_weight]])
    assert self.moe_quant_config is not None
    if (
        prepare_finalize.activation_format
        == mk.FusedMoEActivationFormat.BatchedExperts
    ):
        max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
        assert max_num_tokens_per_rank is not None
        return BatchedMarlinExperts(
            max_num_tokens=max_num_tokens_per_rank,
            num_dispatchers=prepare_finalize.num_dispatchers(),
            quant_config=self.moe_quant_config,
            w13_g_idx=layer.w13_weight_g_idx,
            w2_g_idx=layer.w2_weight_g_idx,
            w13_g_idx_sort_indices=layer.w13_g_idx_sort_indices,
            w2_g_idx_sort_indices=layer.w2_g_idx_sort_indices,
            is_k_full=self.is_k_full,
        )
    else:
        return MarlinExperts(
            quant_config=self.moe_quant_config,
            w13_g_idx=layer.w13_weight_g_idx,
            w2_g_idx=layer.w2_weight_g_idx,
            w13_g_idx_sort_indices=layer.w13_g_idx_sort_indices,
            w2_g_idx_sort_indices=layer.w2_g_idx_sort_indices,
            is_k_full=self.is_k_full,
        )

CompressedTensorsWNA16MoEMethod ¶

Bases: CompressedTensorsMoEMethod

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
    def __init__(
        self,
        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
        moe: FusedMoEConfig,
    ):
        super().__init__(moe)
        self.quant_config = quant_config
        # TODO: @dsikka: refactor this to use schemes as other kernels
        # are supported + check if the layer is being ignored.
        config = self.quant_config.target_scheme_map["Linear"].get("weights")
        self.num_bits = config.num_bits
        self.packed_factor = 32 // config.num_bits
        self.strategy = config.strategy
        # channelwise is not supported by this kernel
        assert config.strategy == "group"
        self.group_size = config.group_size
        # grouped actorder isn't supported by this kernel
        assert config.actorder != "group"
        assert config.symmetric, "Only symmetric quantization is supported for MoE"

        if not (
            self.quant_config.quant_format == CompressionFormat.pack_quantized.value
            and self.num_bits in WNA16_SUPPORTED_BITS
        ):
            raise ValueError(
                "For Fused MoE layers, only ",
                f"{CompressionFormat.pack_quantized.value} ",
                "is supported for the following bits: ",
                f"{WNA16_SUPPORTED_BITS}",
            )

    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        # Will transpose the loaded weight along the
        # intermediate and hidden dim sizes. Will
        # shard for TP along the transposed dims
        extra_weight_attrs.update(
            {"is_transposed": True, "quant_method": self.strategy}
        )
        w13_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size // self.packed_factor,
                2 * intermediate_size_per_partition,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight_packed", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)

        w2_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                intermediate_size_per_partition // self.packed_factor,
                hidden_size,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight_packed", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)

        w2_scales_size = intermediate_size_per_partition

        if self.strategy == "channel":
            num_groups_w2 = num_groups_w13 = 1
            self.group_size = -1
        else:
            num_groups_w2 = w2_scales_size // self.group_size
            num_groups_w13 = hidden_size // self.group_size

        w13_scale = torch.nn.Parameter(
            torch.ones(
                num_experts,
                num_groups_w13,
                2 * intermediate_size_per_partition,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight_scale", w13_scale)
        set_weight_attrs(w13_scale, extra_weight_attrs)

        w2_scale = torch.nn.Parameter(
            torch.ones(num_experts, num_groups_w2, hidden_size, dtype=params_dtype),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight_scale", w2_scale)
        set_weight_attrs(w2_scale, extra_weight_attrs)
        set_weight_attrs(w2_scale, {"load_full_w2": False})

        w2_weight_shape = torch.nn.Parameter(
            torch.empty(num_experts, 2), requires_grad=False
        )
        layer.register_parameter("w2_weight_shape", w2_weight_shape)
        set_weight_attrs(w2_weight_shape, extra_weight_attrs)
        w13_weight_shape = torch.nn.Parameter(
            torch.empty(num_experts, 2), requires_grad=False
        )

        layer.register_parameter("w13_weight_shape", w13_weight_shape)
        set_weight_attrs(w13_weight_shape, extra_weight_attrs)

        w13_g_idx = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight_g_idx", w13_g_idx)
        set_weight_attrs(w13_g_idx, extra_weight_attrs)

        w2_g_idx = torch.nn.Parameter(
            torch.empty(
                num_experts,
                intermediate_size_per_partition,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight_g_idx", w2_g_idx)
        set_weight_attrs(w2_g_idx, extra_weight_attrs)

        w13_g_idx_sort_indices = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices)
        set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)

        w2_g_idx_sort_indices = torch.nn.Parameter(
            torch.empty(
                num_experts,
                intermediate_size_per_partition,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices)
        set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)

        layer.a13_scale = None
        layer.a2_scale = None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # Reconfigure packed weights and scales to match moe_wna16 format
        layer.w13_weight_packed = torch.nn.Parameter(
            layer.w13_weight_packed.transpose(1, 2).contiguous().view(torch.uint8),
            requires_grad=False,
        )
        layer.w2_weight_packed = torch.nn.Parameter(
            layer.w2_weight_packed.transpose(1, 2).contiguous().view(torch.uint8),
            requires_grad=False,
        )
        layer.w13_weight_scale = torch.nn.Parameter(
            layer.w13_weight_scale.transpose(1, 2).contiguous(), requires_grad=False
        )
        layer.w2_weight_scale = torch.nn.Parameter(
            layer.w2_weight_scale.transpose(1, 2).contiguous(), requires_grad=False
        )

    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        assert self.num_bits == 4 or self.num_bits == 8
        config_builder = (
            int4_w4a16_moe_quant_config
            if self.num_bits == 4
            else int8_w8a16_moe_quant_config
        )

        return config_builder(
            w1_scale=layer.w13_weight_scale,
            w2_scale=layer.w2_weight_scale,
            w1_zp=None,
            w2_zp=None,
            block_shape=[0, self.group_size],
        )

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: int | None = None,
        num_expert_group: int | None = None,
        global_num_experts: int = -1,
        expert_map: torch.Tensor | None = None,
        custom_routing_function: Callable | None = None,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        e_score_correction_bias: torch.Tensor | None = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        enable_eplb: bool = False,
        expert_load_view: torch.Tensor | None = None,
        logical_to_physical_map: torch.Tensor | None = None,
        logical_replica_count: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        if enable_eplb:
            if expert_load_view is None:
                raise ValueError("enable_eplb=True requiere expert_load_view != None")
            if logical_to_physical_map is None:
                raise ValueError(
                    "enable_eplb=True requiere logical_to_physical_map != None"
                )
            if logical_replica_count is None:
                raise ValueError(
                    "enable_eplb=True requiere logical_replica_count != None"
                )
            if not isinstance(layer, FusedMoE):
                raise TypeError(
                    "EPLB is only supported when `layer` is a instance of FusedMoE."
                )

        from vllm.model_executor.layers.fused_moe import fused_experts

        topk_weights, topk_ids, _ = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
            use_grouped_topk=use_grouped_topk,
            top_k=top_k,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            routed_scaling_factor=routed_scaling_factor,
            e_score_correction_bias=e_score_correction_bias,
            indices_type=self.topk_indices_dtype,
            num_fused_shared_experts=getattr(layer, "num_fused_shared_experts", 0),
            enable_eplb=enable_eplb,
            expert_map=expert_map,
            expert_load_view=expert_load_view,
            logical_to_physical_map=logical_to_physical_map,
            logical_replica_count=logical_replica_count,
        )

        return fused_experts(
            x,
            layer.w13_weight_packed,
            layer.w2_weight_packed,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            inplace=True,
            activation=activation,
            apply_router_weight_on_input=apply_router_weight_on_input,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            quant_config=self.moe_quant_config,
        )

    @property
    def supports_eplb(self) -> bool:
        return True

group_size `instance-attribute` ¶

group_size = group_size

num_bits `instance-attribute` ¶

num_bits = num_bits

packed_factor `instance-attribute` ¶

packed_factor = 32 // num_bits

quant_config `instance-attribute` ¶

quant_config = quant_config

strategy `instance-attribute` ¶

strategy = strategy

supports_eplb `property` ¶

supports_eplb: bool

init ¶

__init__(
    quant_config: CompressedTensorsConfig,
    moe: FusedMoEConfig,
)

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def __init__(
    self,
    quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
    moe: FusedMoEConfig,
):
    super().__init__(moe)
    self.quant_config = quant_config
    # TODO: @dsikka: refactor this to use schemes as other kernels
    # are supported + check if the layer is being ignored.
    config = self.quant_config.target_scheme_map["Linear"].get("weights")
    self.num_bits = config.num_bits
    self.packed_factor = 32 // config.num_bits
    self.strategy = config.strategy
    # channelwise is not supported by this kernel
    assert config.strategy == "group"
    self.group_size = config.group_size
    # grouped actorder isn't supported by this kernel
    assert config.actorder != "group"
    assert config.symmetric, "Only symmetric quantization is supported for MoE"

    if not (
        self.quant_config.quant_format == CompressionFormat.pack_quantized.value
        and self.num_bits in WNA16_SUPPORTED_BITS
    ):
        raise ValueError(
            "For Fused MoE layers, only ",
            f"{CompressionFormat.pack_quantized.value} ",
            "is supported for the following bits: ",
            f"{WNA16_SUPPORTED_BITS}",
        )

apply ¶

apply(
    layer: Module,
    x: Tensor,
    router_logits: Tensor,
    top_k: int,
    renormalize: bool,
    use_grouped_topk: bool = False,
    topk_group: int | None = None,
    num_expert_group: int | None = None,
    global_num_experts: int = -1,
    expert_map: Tensor | None = None,
    custom_routing_function: Callable | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    e_score_correction_bias: Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    enable_eplb: bool = False,
    expert_load_view: Tensor | None = None,
    logical_to_physical_map: Tensor | None = None,
    logical_replica_count: Tensor | None = None,
) -> Tensor | tuple[Tensor, Tensor]

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def apply(
    self,
    layer: torch.nn.Module,
    x: torch.Tensor,
    router_logits: torch.Tensor,
    top_k: int,
    renormalize: bool,
    use_grouped_topk: bool = False,
    topk_group: int | None = None,
    num_expert_group: int | None = None,
    global_num_experts: int = -1,
    expert_map: torch.Tensor | None = None,
    custom_routing_function: Callable | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    e_score_correction_bias: torch.Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    enable_eplb: bool = False,
    expert_load_view: torch.Tensor | None = None,
    logical_to_physical_map: torch.Tensor | None = None,
    logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    if enable_eplb:
        if expert_load_view is None:
            raise ValueError("enable_eplb=True requiere expert_load_view != None")
        if logical_to_physical_map is None:
            raise ValueError(
                "enable_eplb=True requiere logical_to_physical_map != None"
            )
        if logical_replica_count is None:
            raise ValueError(
                "enable_eplb=True requiere logical_replica_count != None"
            )
        if not isinstance(layer, FusedMoE):
            raise TypeError(
                "EPLB is only supported when `layer` is a instance of FusedMoE."
            )

    from vllm.model_executor.layers.fused_moe import fused_experts

    topk_weights, topk_ids, _ = FusedMoE.select_experts(
        hidden_states=x,
        router_logits=router_logits,
        use_grouped_topk=use_grouped_topk,
        top_k=top_k,
        renormalize=renormalize,
        topk_group=topk_group,
        num_expert_group=num_expert_group,
        custom_routing_function=custom_routing_function,
        scoring_func=scoring_func,
        routed_scaling_factor=routed_scaling_factor,
        e_score_correction_bias=e_score_correction_bias,
        indices_type=self.topk_indices_dtype,
        num_fused_shared_experts=getattr(layer, "num_fused_shared_experts", 0),
        enable_eplb=enable_eplb,
        expert_map=expert_map,
        expert_load_view=expert_load_view,
        logical_to_physical_map=logical_to_physical_map,
        logical_replica_count=logical_replica_count,
    )

    return fused_experts(
        x,
        layer.w13_weight_packed,
        layer.w2_weight_packed,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        inplace=True,
        activation=activation,
        apply_router_weight_on_input=apply_router_weight_on_input,
        global_num_experts=global_num_experts,
        expert_map=expert_map,
        quant_config=self.moe_quant_config,
    )

create_weights ¶

create_weights(
    layer: Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: dtype,
    **extra_weight_attrs,
)

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def create_weights(
    self,
    layer: torch.nn.Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: torch.dtype,
    **extra_weight_attrs,
):
    # Will transpose the loaded weight along the
    # intermediate and hidden dim sizes. Will
    # shard for TP along the transposed dims
    extra_weight_attrs.update(
        {"is_transposed": True, "quant_method": self.strategy}
    )
    w13_weight = torch.nn.Parameter(
        torch.empty(
            num_experts,
            hidden_size // self.packed_factor,
            2 * intermediate_size_per_partition,
            dtype=torch.int32,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w13_weight_packed", w13_weight)
    set_weight_attrs(w13_weight, extra_weight_attrs)

    w2_weight = torch.nn.Parameter(
        torch.empty(
            num_experts,
            intermediate_size_per_partition // self.packed_factor,
            hidden_size,
            dtype=torch.int32,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w2_weight_packed", w2_weight)
    set_weight_attrs(w2_weight, extra_weight_attrs)

    w2_scales_size = intermediate_size_per_partition

    if self.strategy == "channel":
        num_groups_w2 = num_groups_w13 = 1
        self.group_size = -1
    else:
        num_groups_w2 = w2_scales_size // self.group_size
        num_groups_w13 = hidden_size // self.group_size

    w13_scale = torch.nn.Parameter(
        torch.ones(
            num_experts,
            num_groups_w13,
            2 * intermediate_size_per_partition,
            dtype=params_dtype,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w13_weight_scale", w13_scale)
    set_weight_attrs(w13_scale, extra_weight_attrs)

    w2_scale = torch.nn.Parameter(
        torch.ones(num_experts, num_groups_w2, hidden_size, dtype=params_dtype),
        requires_grad=False,
    )
    layer.register_parameter("w2_weight_scale", w2_scale)
    set_weight_attrs(w2_scale, extra_weight_attrs)
    set_weight_attrs(w2_scale, {"load_full_w2": False})

    w2_weight_shape = torch.nn.Parameter(
        torch.empty(num_experts, 2), requires_grad=False
    )
    layer.register_parameter("w2_weight_shape", w2_weight_shape)
    set_weight_attrs(w2_weight_shape, extra_weight_attrs)
    w13_weight_shape = torch.nn.Parameter(
        torch.empty(num_experts, 2), requires_grad=False
    )

    layer.register_parameter("w13_weight_shape", w13_weight_shape)
    set_weight_attrs(w13_weight_shape, extra_weight_attrs)

    w13_g_idx = torch.nn.Parameter(
        torch.empty(
            num_experts,
            hidden_size,
            dtype=torch.int32,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w13_weight_g_idx", w13_g_idx)
    set_weight_attrs(w13_g_idx, extra_weight_attrs)

    w2_g_idx = torch.nn.Parameter(
        torch.empty(
            num_experts,
            intermediate_size_per_partition,
            dtype=torch.int32,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w2_weight_g_idx", w2_g_idx)
    set_weight_attrs(w2_g_idx, extra_weight_attrs)

    w13_g_idx_sort_indices = torch.nn.Parameter(
        torch.empty(
            num_experts,
            hidden_size,
            dtype=torch.int32,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices)
    set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)

    w2_g_idx_sort_indices = torch.nn.Parameter(
        torch.empty(
            num_experts,
            intermediate_size_per_partition,
            dtype=torch.int32,
        ),
        requires_grad=False,
    )
    layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices)
    set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)

    layer.a13_scale = None
    layer.a2_scale = None

get_fused_moe_quant_config ¶

get_fused_moe_quant_config(
    layer: Module,
) -> FusedMoEQuantConfig | None

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def get_fused_moe_quant_config(
    self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
    assert self.num_bits == 4 or self.num_bits == 8
    config_builder = (
        int4_w4a16_moe_quant_config
        if self.num_bits == 4
        else int8_w8a16_moe_quant_config
    )

    return config_builder(
        w1_scale=layer.w13_weight_scale,
        w2_scale=layer.w2_weight_scale,
        w1_zp=None,
        w2_zp=None,
        block_shape=[0, self.group_size],
    )

process_weights_after_loading ¶

process_weights_after_loading(layer: Module) -> None

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    # Reconfigure packed weights and scales to match moe_wna16 format
    layer.w13_weight_packed = torch.nn.Parameter(
        layer.w13_weight_packed.transpose(1, 2).contiguous().view(torch.uint8),
        requires_grad=False,
    )
    layer.w2_weight_packed = torch.nn.Parameter(
        layer.w2_weight_packed.transpose(1, 2).contiguous().view(torch.uint8),
        requires_grad=False,
    )
    layer.w13_weight_scale = torch.nn.Parameter(
        layer.w13_weight_scale.transpose(1, 2).contiguous(), requires_grad=False
    )
    layer.w2_weight_scale = torch.nn.Parameter(
        layer.w2_weight_scale.transpose(1, 2).contiguous(), requires_grad=False
    )

GPTQMarlinState ¶

Bases: Enum

Source code in vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

class GPTQMarlinState(Enum):
    REPACK = enum.auto()
    READY = enum.auto()

READY `class-attribute` `instance-attribute` ¶

READY = auto()

REPACK `class-attribute` `instance-attribute` ¶

REPACK = auto()

vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe ¶

__all__ module-attribute ¶

logger module-attribute ¶

CompressedTensorsMoEMethod ¶

get_moe_method staticmethod ¶

CompressedTensorsW4A4MoeMethod ¶

allow_flashinfer instance-attribute ¶

cutlass_nvfp4_supported instance-attribute ¶

flashinfer_moe_backend instance-attribute ¶

group_size instance-attribute ¶

use_marlin instance-attribute ¶

__init__ ¶

apply ¶

create_weights ¶

get_fused_moe_quant_config ¶

maybe_make_prepare_finalize ¶

process_weights_after_loading ¶

select_gemm_impl ¶

CompressedTensorsW4A8Int8MoEMethod ¶

group_size instance-attribute ¶

has_bias instance-attribute ¶

quant_config instance-attribute ¶

static_input_scales instance-attribute ¶

__init__ ¶

apply ¶

create_weights ¶

get_fused_moe_quant_config ¶

process_weights_after_loading ¶

CompressedTensorsW8A8Fp8MoEMethod ¶

block_quant instance-attribute ¶

disable_expert_map instance-attribute ¶

input_quant instance-attribute ¶

is_fp8_w8a8_sm100 instance-attribute ¶

quant_config instance-attribute ¶

rocm_aiter_moe_enabled instance-attribute ¶

static_input_scales instance-attribute ¶

supports_eplb property ¶

use_cutlass instance-attribute ¶

use_marlin instance-attribute ¶

weight_block_size instance-attribute ¶

weight_quant instance-attribute ¶

__init__ ¶

apply ¶

create_weights ¶

get_fused_moe_quant_config ¶

maybe_make_prepare_finalize ¶

process_weights_after_loading ¶

select_gemm_impl ¶

CompressedTensorsW8A8Int8MoEMethod ¶

input_quant instance-attribute ¶

quant_config instance-attribute ¶

static_input_scales instance-attribute ¶

weight_quant instance-attribute ¶

__init__ ¶

apply ¶

create_weights ¶

get_fused_moe_quant_config ¶

process_weights_after_loading ¶

CompressedTensorsWNA16MarlinMoEMethod ¶

actorder instance-attribute ¶

group_size instance-attribute ¶

num_bits instance-attribute ¶

packed_factor instance-attribute ¶

quant_config instance-attribute ¶

quant_type instance-attribute ¶

strategy instance-attribute ¶

use_marlin instance-attribute ¶

__init__ ¶

apply ¶

create_weights ¶

get_fused_moe_quant_config ¶

process_weights_after_loading ¶

select_gemm_impl ¶

CompressedTensorsWNA16MoEMethod ¶

group_size instance-attribute ¶

num_bits instance-attribute ¶

packed_factor instance-attribute ¶

quant_config instance-attribute ¶

strategy instance-attribute ¶

supports_eplb property ¶

all `module-attribute` ¶

logger `module-attribute` ¶

get_moe_method `staticmethod` ¶

allow_flashinfer `instance-attribute` ¶

cutlass_nvfp4_supported `instance-attribute` ¶

flashinfer_moe_backend `instance-attribute` ¶

group_size `instance-attribute` ¶

use_marlin `instance-attribute` ¶

init ¶

group_size `instance-attribute` ¶

has_bias `instance-attribute` ¶

quant_config `instance-attribute` ¶

static_input_scales `instance-attribute` ¶

init ¶

block_quant `instance-attribute` ¶

disable_expert_map `instance-attribute` ¶

input_quant `instance-attribute` ¶

is_fp8_w8a8_sm100 `instance-attribute` ¶

quant_config `instance-attribute` ¶

rocm_aiter_moe_enabled `instance-attribute` ¶

static_input_scales `instance-attribute` ¶

supports_eplb `property` ¶

use_cutlass `instance-attribute` ¶

use_marlin `instance-attribute` ¶

weight_block_size `instance-attribute` ¶

weight_quant `instance-attribute` ¶

init ¶

input_quant `instance-attribute` ¶

quant_config `instance-attribute` ¶

static_input_scales `instance-attribute` ¶

weight_quant `instance-attribute` ¶

init ¶

actorder `instance-attribute` ¶

group_size `instance-attribute` ¶

num_bits `instance-attribute` ¶

packed_factor `instance-attribute` ¶

quant_config `instance-attribute` ¶

quant_type `instance-attribute` ¶

strategy `instance-attribute` ¶

use_marlin `instance-attribute` ¶

init ¶

group_size `instance-attribute` ¶

num_bits `instance-attribute` ¶

packed_factor `instance-attribute` ¶

quant_config `instance-attribute` ¶

strategy `instance-attribute` ¶

supports_eplb `property` ¶

init ¶

READY `class-attribute` `instance-attribute` ¶

REPACK `class-attribute` `instance-attribute` ¶