Skip to content

vllm.model_executor.layers.fused_moe.oracle.mxfp8

Functions:

_mxfp8_backend_to_kernel_cls(backend)

Resolve the MXFP8 expert classes for a backend.

DeepGEMM resolves directly to DeepGemmExperts (not the TritonOrDeepGemmExperts wrapper, whose Triton fallback cannot handle the MXFP8 1x32 scheme); all other backends defer to the FP8 resolver.

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp8.py
def _mxfp8_backend_to_kernel_cls(
    backend: Fp8MoeBackend,
) -> list[type[mk.FusedMoEExperts]]:
    """Map an MXFP8 MoE backend to its candidate expert classes.

    Every backend except DeepGEMM is forwarded unchanged to the generic FP8
    resolver ``backend_to_kernel_cls``. DeepGEMM is special-cased to return
    ``DeepGemmExperts`` on its own rather than the ``TritonOrDeepGemmExperts``
    wrapper, because that wrapper's Triton fallback cannot handle the MXFP8
    1x32 scheme.

    Args:
        backend: The backend whose expert classes should be resolved.

    Returns:
        The candidate expert classes for ``backend``.
    """
    # Common case first: nothing MXFP8-specific, reuse the FP8 resolution.
    if backend != Fp8MoeBackend.DEEPGEMM:
        return backend_to_kernel_cls(backend)

    # Imported lazily so the DeepGEMM module is only loaded when selected.
    from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
        DeepGemmExperts,
    )

    return [DeepGemmExperts]

_select_kernel_cls(backend, config)

Select the first supported expert class for the MXFP8 config.

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp8.py
def _select_kernel_cls(
    backend: Fp8MoeBackend,
    config: FusedMoEConfig,
) -> type[mk.FusedMoEExperts]:
    """Return the first candidate expert class that accepts ``config``.

    Candidates come from ``_mxfp8_backend_to_kernel_cls(backend)`` and are
    probed in order via ``is_supported_config`` with ``kMxfp8Static``,
    ``kMxfp8Dynamic`` and the activation format chosen from the parallel
    config.

    Args:
        backend: The backend whose expert classes are candidates.
        config: The fused-MoE configuration each candidate is checked against.

    Returns:
        The first expert class that reports the configuration as supported.

    Raises:
        ValueError: If no candidate supports the configuration. The message
            carries the reason reported by the last candidate probed.
    """
    if config.moe_parallel_config.use_batched_activation_format:
        fmt = mk.FusedMoEActivationFormat.BatchedExperts
    else:
        fmt = mk.FusedMoEActivationFormat.Standard

    rejection: str | None = None
    for candidate in _mxfp8_backend_to_kernel_cls(backend):
        # The class is passed explicitly as the first argument, exactly as the
        # call site has always done.
        ok, why = candidate.is_supported_config(
            candidate, config, kMxfp8Static, kMxfp8Dynamic, fmt
        )
        if not ok:
            # Remember only the most recent rejection for the error message.
            rejection = why
            continue
        return candidate

    raise ValueError(
        f"No supported MXFP8 expert class for {backend.value}: {rejection}"
    )

_select_rocm_mxfp8_backend()

ROCm fallback when vendor MXFP8 backends are unavailable.

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp8.py
def _select_rocm_mxfp8_backend() -> tuple[Fp8MoeBackend, type[mk.FusedMoEExperts]]:
    """Pick the ROCm MXFP8 fallback when no vendor backend is usable.

    Returns:
        ``(Fp8MoeBackend.NATIVE_MXFP8, Mxfp8NativeTritonExperts)`` when
        ``current_platform.supports_mx()`` is true, otherwise
        ``(Fp8MoeBackend.EMULATION, Mxfp8EmulationTritonExperts)``.
    """
    if not current_platform.supports_mx():
        # No MX support reported: fall back to the emulation experts.
        from vllm.model_executor.layers.fused_moe.experts.mxfp8_emulation_moe import (
            Mxfp8EmulationTritonExperts,
        )

        logger.info_once(
            "No native MXFP8 MoE backend available on this device; "
            "MXFP8 weights will be dequantized to BF16 once at load time and the "
            "MoE will run in BF16 (no per-step dequant)."
        )
        return Fp8MoeBackend.EMULATION, Mxfp8EmulationTritonExperts

    # MX is supported: use the native experts. The import stays local so the
    # module is only loaded on this path.
    from vllm.model_executor.layers.fused_moe.experts.mxfp8_native_moe import (
        Mxfp8NativeTritonExperts,
    )

    logger.info_once("Using native CDNA4 (gfx950) MXFP8 dot_scaled MoE backend.")
    return Fp8MoeBackend.NATIVE_MXFP8, Mxfp8NativeTritonExperts

select_mxfp8_moe_backend(config)

Select the MXFP8 MoE backend and the best expert class.

Returns: A tuple of (fp8_backend, experts_cls).

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp8.py
def select_mxfp8_moe_backend(
    config: FusedMoEConfig,
) -> tuple[Fp8MoeBackend, type[mk.FusedMoEExperts]]:
    """Select the MXFP8 MoE backend and the best expert class.

    If ``config.moe_backend`` is anything other than ``"auto"``, it is looked
    up in ``_BACKEND_NAME_MAP`` and that backend is used. Otherwise the entries
    of ``_SUPPORTED_BACKENDS`` are tried in order and the first one that yields
    a supported expert class wins. On ROCm, when none of them is usable, the
    choice is delegated to ``_select_rocm_mxfp8_backend``.

    Args:
        config: The fused-MoE configuration to select a backend for.

    Returns:
        A tuple of (fp8_backend, experts_cls).

    Raises:
        ValueError: If the requested backend name is not in
            ``_BACKEND_NAME_MAP``, if the requested backend has no supported
            expert class for ``config``, or if auto-selection finds no usable
            backend. In the last case the message lists why each candidate
            backend was rejected.
    """

    runner_backend = config.moe_backend
    if runner_backend != "auto":
        backend = _BACKEND_NAME_MAP.get(runner_backend)
        if backend is None:
            raise ValueError(
                f"moe_backend='{runner_backend}' is not supported for "
                f"MXFP8 MoE. Expected one of "
                f"{list(_BACKEND_NAME_MAP.keys())}."
            )
        logger.info_once(
            "Using '%s' MxFp8 MoE backend (user-requested).",
            backend.value,
        )
        # An explicit request is never silently replaced: an unsupported
        # config propagates the ValueError from _select_kernel_cls.
        return backend, _select_kernel_cls(backend, config)

    # Auto-select: pick the first supported backend. Keep the reason each
    # candidate was rejected so a total failure is diagnosable.
    rejections: list[str] = []
    for backend in _SUPPORTED_BACKENDS:
        try:
            experts_cls = _select_kernel_cls(backend, config)
        except ValueError as err:
            rejections.append(f"{backend.value} ({err})")
            continue
        logger.info_once("Using '%s' MxFp8 MoE backend.", backend.value)
        return backend, experts_cls

    # ROCm logic is kept simple for now; refactor once more backends are
    # supported.
    if current_platform.is_rocm():
        return _select_rocm_mxfp8_backend()

    message = "No MXFP8 MoE backends available."
    if rejections:
        message += " Tried: " + "; ".join(rejections)
    raise ValueError(message)