vllm.model_executor.layers.fused_moe.oracle.int_wna16

_get_priority_backends

_get_priority_backends() -> list[WNA16MoEBackend]

Get available backends in priority order based on platform and config.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py
def _get_priority_backends() -> list[WNA16MoEBackend]:
    """
    Get available backends in priority order based on platform and config.
    """
    _AVAILABLE_BACKENDS = [
        WNA16MoEBackend.MARLIN,
        WNA16MoEBackend.BATCHED_MARLIN,
    ]
    return _AVAILABLE_BACKENDS
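
As a quick, hedged illustration (this assumes the module path shown above is importable in your environment), the priority order can be inspected directly:

# Minimal sketch: list the WNA16 MoE backends in the order the oracle tries them.
from vllm.model_executor.layers.fused_moe.oracle.int_wna16 import (
    _get_priority_backends,
)

for backend in _get_priority_backends():
    print(backend)  # MARLIN is tried before BATCHED_MARLIN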

_process_weights_marlin

_process_weights_marlin(
    layer: Module,
    quant_config: GPTQMarlinConfig,
    input_dtype: dtype | None,
    w13_qweight: Tensor,
    w2_qweight: Tensor,
    w13_scales: Tensor,
    w2_scales: Tensor,
    w13_g_idx: Tensor,
    w2_g_idx: Tensor,
    w13_bias: Tensor | None = None,
    w2_bias: Tensor | None = None,
) -> tuple[
    Tensor,
    Tensor,
    Tensor,
    Tensor,
    Tensor,
    Tensor,
    Tensor,
    Tensor,
    Tensor | None,
    Tensor | None,
    Tensor | None,
    Tensor | None,
]

Standard Marlin weight post-processing shared by MARLIN and BATCHED_MARLIN backends.

Steps

  1. Optional FP8 preprocessing of packed weights / scales.
  2. Sort / reset g_idx tensors for act-order handling.
  3. Repack weights via gptq_marlin_moe_repack.
  4. Permute scales (and optionally extract INT8 global scales).
  5. Permute bias tensors.
Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py
def _process_weights_marlin(
    layer: torch.nn.Module,
    quant_config: "GPTQMarlinConfig",
    input_dtype: torch.dtype | None,
    w13_qweight: torch.Tensor,
    w2_qweight: torch.Tensor,
    w13_scales: torch.Tensor,
    w2_scales: torch.Tensor,
    w13_g_idx: torch.Tensor,
    w2_g_idx: torch.Tensor,
    w13_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
) -> tuple[
    torch.Tensor,  # w13_qweight
    torch.Tensor,  # w2_qweight
    torch.Tensor,  # w13_scales
    torch.Tensor,  # w2_scales
    torch.Tensor,  # w13_g_idx
    torch.Tensor,  # w2_g_idx
    torch.Tensor,  # w13_g_idx_sort_indices
    torch.Tensor,  # w2_g_idx_sort_indices
    torch.Tensor | None,  # w13_input_global_scale
    torch.Tensor | None,  # w2_input_global_scale
    torch.Tensor | None,  # w13_bias
    torch.Tensor | None,  # w2_bias
]:
    """Standard Marlin weight post-processing shared by MARLIN and
    BATCHED_MARLIN backends.

    Steps
    -----
    1. Optional FP8 preprocessing of packed weights / scales.
    2. Sort / reset g_idx tensors for act-order handling.
    3. Repack weights via ``gptq_marlin_moe_repack``.
    4. Permute scales (and optionally extract INT8 global scales).
    5. Permute bias tensors.
    """
    is_a_8bit = input_dtype is not None and input_dtype.itemsize == 1

    marlin_w13_qweight: torch.Tensor
    marlin_w2_qweight: torch.Tensor
    marlin_w13_scales: torch.Tensor
    marlin_w2_scales: torch.Tensor
    w13_g_idx_sort_indices: torch.Tensor | None = None
    w2_g_idx_sort_indices: torch.Tensor | None = None
    w13_input_global_scale: torch.Tensor | None = None
    w2_input_global_scale: torch.Tensor | None = None
    w13_bias_out: torch.Tensor | None = None
    w2_bias_out: torch.Tensor | None = None

    # --- FP8 weight / scale adjustment ---
    if input_dtype == torch.float8_e4m3fn:
        marlin_w13_qweight = ops.marlin_int4_fp8_preprocess(w13_qweight, inplace=False)
        marlin_w2_qweight = ops.marlin_int4_fp8_preprocess(w2_qweight, inplace=False)
        marlin_w13_scales = w13_scales.data * 512
        marlin_w2_scales = w2_scales.data * 512
    else:
        marlin_w13_qweight = w13_qweight
        marlin_w2_qweight = w2_qweight
        marlin_w13_scales = w13_scales
        marlin_w2_scales = w2_scales

    # --- Process act_order (g_idx) ---
    if quant_config.desc_act:
        num_experts = w13_g_idx.shape[0]
        w13_g_idx_sort_indices = torch.empty_like(w13_g_idx)
        w2_g_idx_sort_indices = torch.empty_like(w2_g_idx)
        w13_sorted_g_idx = torch.empty_like(w13_g_idx)
        w2_sorted_g_idx = torch.empty_like(w2_g_idx)
        for e in range(num_experts):
            w13_g_idx_sort_indices[e] = torch.argsort(w13_g_idx[e]).to(torch.int32)
            w2_g_idx_sort_indices[e] = torch.argsort(w2_g_idx[e]).to(torch.int32)
            w13_sorted_g_idx[e] = w13_g_idx[e][w13_g_idx_sort_indices[e]]
            w2_sorted_g_idx[e] = w2_g_idx[e][w2_g_idx_sort_indices[e]]
    else:
        num_experts = w13_g_idx.shape[0]
        device = w13_g_idx.device
        w13_g_idx = torch.nn.Parameter(
            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
            requires_grad=False,
        )
        w2_g_idx = torch.nn.Parameter(
            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
            requires_grad=False,
        )
        w13_g_idx_sort_indices = torch.nn.Parameter(
            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
            requires_grad=False,
        )
        w2_g_idx_sort_indices = torch.nn.Parameter(
            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
            requires_grad=False,
        )

    # --- Repack weights ---
    marlin_w13_qweight = ops.gptq_marlin_moe_repack(
        marlin_w13_qweight,
        w13_g_idx_sort_indices,
        marlin_w13_qweight.shape[1] * quant_config.pack_factor,
        marlin_w13_qweight.shape[2],
        quant_config.quant_type.size_bits,
        is_a_8bit=is_a_8bit,
    )
    marlin_w2_qweight = ops.gptq_marlin_moe_repack(
        marlin_w2_qweight,
        w2_g_idx_sort_indices,
        marlin_w2_qweight.shape[1] * quant_config.pack_factor,
        marlin_w2_qweight.shape[2],
        quant_config.quant_type.size_bits,
        is_a_8bit=is_a_8bit,
    )

    # --- Permute scales ---
    marlin_w13_scales = marlin_moe_permute_scales(
        s=marlin_w13_scales,
        size_k=layer.intermediate_size_per_partition,
        size_n=marlin_w13_scales.shape[2],
        group_size=quant_config.group_size,
        is_a_8bit=is_a_8bit,
    )
    marlin_w2_scales = marlin_moe_permute_scales(
        s=marlin_w2_scales,
        size_k=marlin_w2_scales.shape[1]
        * (
            quant_config.group_size
            if quant_config.group_size != -1
            else quant_config.pack_factor
        ),
        size_n=marlin_w2_scales.shape[2],
        group_size=quant_config.group_size,
        is_a_8bit=is_a_8bit,
    )

    if input_dtype == torch.int8:
        if layer.num_groups_w13 > 1:
            marlin_w13_scales, w13_input_global_scale = marlin_act_int8_process_scales(
                marlin_w13_scales
            )
        if layer.num_groups_w2 > 1:
            marlin_w2_scales, w2_input_global_scale = marlin_act_int8_process_scales(
                marlin_w2_scales
            )

    # --- Permute bias ---
    if w13_bias is not None:
        w13_bias_out = marlin_permute_bias(w13_bias)
    if w2_bias is not None:
        w2_bias_out = marlin_permute_bias(w2_bias)

    return (
        marlin_w13_qweight,
        marlin_w2_qweight,
        marlin_w13_scales,
        marlin_w2_scales,
        w13_g_idx,
        w2_g_idx,
        w13_g_idx_sort_indices,
        w2_g_idx_sort_indices,
        w13_input_global_scale,
        w2_input_global_scale,
        w13_bias_out,
        w2_bias_out,
    )
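
Step 2 above (act-order handling) is the least obvious part of this routine. The following standalone toy sketch, not taken from vLLM and using arbitrary shapes, shows what the per-expert argsort computes: a permutation that makes each expert's group indices monotonically increasing, which is the layout the Marlin repack expects.

import torch

# Toy illustration of step 2: per-expert sorting of g_idx (arbitrary shapes).
num_experts = 2
g_idx = torch.tensor(
    [[2, 0, 1, 0, 2, 1],
     [1, 1, 0, 2, 0, 2]],
    dtype=torch.int32,
)

sort_indices = torch.empty_like(g_idx)
sorted_g_idx = torch.empty_like(g_idx)
for e in range(num_experts):
    perm = torch.argsort(g_idx[e])          # int64 permutation for this expert
    sort_indices[e] = perm.to(torch.int32)  # stored as int32, as in the real code
    sorted_g_idx[e] = g_idx[e][perm]        # group indices now ascending

print(sorted_g_idx)
# tensor([[0, 0, 1, 1, 2, 2],
#         [0, 0, 1, 1, 2, 2]], dtype=torch.int32)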

backend_to_kernel_cls

backend_to_kernel_cls(
    backend: WNA16MoEBackend,
) -> list[type[FusedMoEExperts]]

Return the kernel (experts) classes for the given backend; raises ValueError for an unknown backend.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py
def backend_to_kernel_cls(
    backend: WNA16MoEBackend,
) -> list[type[mk.FusedMoEExperts]]:
    """Return the experts class for the given backend, or None for NONE."""
    if backend == WNA16MoEBackend.MARLIN:
        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
            MarlinExperts,
        )

        return [MarlinExperts]

    elif backend == WNA16MoEBackend.BATCHED_MARLIN:
        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
            BatchedMarlinExperts,
        )

        return [BatchedMarlinExperts]

    else:
        raise ValueError(f"Unknown WNA16 MoE backend: {backend.value}")
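
A brief, hedged usage sketch (assuming WNA16MoEBackend and this function are importable from the module documented here):

# Sketch only: resolve the kernel class list for each known backend.
from vllm.model_executor.layers.fused_moe.oracle.int_wna16 import (
    WNA16MoEBackend,
    backend_to_kernel_cls,
)

for backend in (WNA16MoEBackend.MARLIN, WNA16MoEBackend.BATCHED_MARLIN):
    kernel_classes = backend_to_kernel_cls(backend)
    print(backend, [cls.__name__ for cls in kernel_classes])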

convert_to_wna16_moe_kernel_format

convert_to_wna16_moe_kernel_format(
    backend: WNA16MoEBackend,
    layer: Module,
    quant_config: QuantizationConfig,
    input_dtype: dtype | None,
    w13: Tensor,
    w2: Tensor,
    w13_scale: Tensor,
    w2_scale: Tensor,
    w13_g_idx: Tensor,
    w2_g_idx: Tensor,
    w13_bias: Tensor | None = None,
    w2_bias: Tensor | None = None,
) -> tuple[
    Tensor,
    Tensor,
    Tensor,
    Tensor,
    Tensor | None,
    Tensor | None,
    Tensor | None,
    Tensor | None,
    Tensor | None,
    Tensor | None,
    Tensor | None,
    Tensor | None,
]

Dispatch weight post-processing to the appropriate per-backend handler.

To add a new backend, implement a _process_weights_<name> helper and add a branch here.

Parameters:

  backend (WNA16MoEBackend): the selected WNA16MoEBackend. Required.
  layer (Module): the FusedMoE layer whose parameters are being prepared. Required.
  quant_config (QuantizationConfig): the QuantizationConfig for this layer. Required.
  input_dtype (dtype | None): optional activation dtype; usually 16-bit. Required.
Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py
def convert_to_wna16_moe_kernel_format(
    backend: WNA16MoEBackend,
    layer: torch.nn.Module,
    quant_config: QuantizationConfig,
    input_dtype: torch.dtype | None,
    w13: torch.Tensor,
    w2: torch.Tensor,
    w13_scale: torch.Tensor,
    w2_scale: torch.Tensor,
    w13_g_idx: torch.Tensor,
    w2_g_idx: torch.Tensor,
    w13_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
) -> tuple[
    torch.Tensor,  # w13_qweight
    torch.Tensor,  # w2_qweight
    torch.Tensor,  # w13_scales
    torch.Tensor,  # w2_scales
    torch.Tensor | None,  # w13_g_idx
    torch.Tensor | None,  # w2_g_idx
    torch.Tensor | None,  # w13_g_idx_sort_indices
    torch.Tensor | None,  # w2_g_idx_sort_indices
    torch.Tensor | None,  # w13_input_global_scale
    torch.Tensor | None,  # w2_input_global_scale
    torch.Tensor | None,  # w13_bias
    torch.Tensor | None,  # w2_bias
]:
    """Dispatch weight post-processing to the appropriate per-backend handler.

    To add a new backend, implement a ``_process_weights_<name>`` helper and
    add a branch here.

    Args:
        backend: the selected ``WNA16MoEBackend``.
        layer: the ``FusedMoE`` layer whose parameters are being prepared.
        quant_config: the ``QuantizationConfig`` for this layer.
        input_dtype: optional activation dtype, usually should be 16 bit.
    """
    if backend in (
        WNA16MoEBackend.MARLIN,
        WNA16MoEBackend.BATCHED_MARLIN,
    ):
        from vllm.model_executor.layers.quantization.gptq_marlin import (
            GPTQMarlinConfig,
        )

        if not isinstance(quant_config, GPTQMarlinConfig):
            raise TypeError(
                "Marlin WNA16 MoE backend requires GPTQMarlinConfig, got "
                f"{type(quant_config).__name__}."
            )
        return _process_weights_marlin(
            layer,
            quant_config,
            input_dtype,
            w13,
            w2,
            w13_scale,
            w2_scale,
            w13_g_idx,
            w2_g_idx,
            w13_bias,
            w2_bias,
        )
    else:
        raise ValueError(f"Unsupported wna16 MoE backend: {backend.value}")
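
To make the extension point concrete, here is a hypothetical helper skeleton. Neither _process_weights_mybackend nor any backend named MYBACKEND exists in vLLM; this only illustrates the 12-tuple layout a new _process_weights_<name> helper is expected to return before a matching branch is added to the dispatcher above.

import torch

# Hypothetical sketch only: _process_weights_mybackend does not exist in vLLM.
# It shows the shape of a per-backend helper that
# convert_to_wna16_moe_kernel_format could dispatch to: take the raw packed
# weights, scales, g_idx and bias tensors and return the 12-tuple layout
# documented above, with None in any unused slot.
def _process_weights_mybackend(
    layer: torch.nn.Module,
    quant_config,
    input_dtype: torch.dtype | None,
    w13_qweight: torch.Tensor,
    w2_qweight: torch.Tensor,
    w13_scales: torch.Tensor,
    w2_scales: torch.Tensor,
    w13_g_idx: torch.Tensor,
    w2_g_idx: torch.Tensor,
    w13_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
):
    # A real backend would repack and permute here; this placeholder passes
    # tensors through unchanged and leaves the optional slots as None.
    return (
        w13_qweight, w2_qweight,
        w13_scales, w2_scales,
        w13_g_idx, w2_g_idx,
        None, None,   # g_idx sort indices (unused by this placeholder)
        None, None,   # input global scales
        w13_bias, w2_bias,
    )

A new WNA16MoEBackend member plus an extra branch in convert_to_wna16_moe_kernel_format would then route to such a helper.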

select_wna16_moe_backend

select_wna16_moe_backend(
    config: FusedMoEConfig,
    weight_key: QuantKey,
    weight_bits: int,
) -> tuple[WNA16MoEBackend, type[FusedMoEExperts]]

Select the WNA16 MoE backend.

Parameters:

  config (FusedMoEConfig): the shared FusedMoEConfig for this layer. Required.
  weight_key (QuantKey): the quantization key describing the layer's weights. Required.
  weight_bits (int): quantization bit-width (4 or 8). 8-bit weights are not supported by the modular Marlin kernel. Required.

Returns:

  tuple[WNA16MoEBackend, type[FusedMoEExperts]]: the selected backend and its experts (kernel) class.

Source code in vllm/model_executor/layers/fused_moe/oracle/int_wna16.py
def select_wna16_moe_backend(
    config: FusedMoEConfig,
    weight_key: QuantKey,
    weight_bits: int,
) -> tuple[WNA16MoEBackend, type[mk.FusedMoEExperts]]:
    """Select the WNA16 MoE backend.

    Args:
        config: the shared ``FusedMoEConfig`` for this layer.
        weight_key: the quantization key describing the layer's weights.
        weight_bits: quantization bit-width (4 or 8). 8-bit weights are not
            supported by the modular Marlin kernel.

    Returns:
        A tuple of (``WNA16MoEBackend``, experts class).
    """

    activation_format = (
        mk.FusedMoEActivationFormat.BatchedExperts
        if config.moe_parallel_config.use_batched_activation_format
        else mk.FusedMoEActivationFormat.Standard
    )

    def _make_log_backend(backend: WNA16MoEBackend):
        return f"Using '{backend.value}' WNA16 MoE backend."

    def _make_log_unsupported(backend: WNA16MoEBackend, reason: str | None) -> str:
        if reason:
            return (
                f"WNA16 MoE backend '{backend.value}' does not support the "
                f"deployment configuration since {reason}."
            )
        return (
            f"WNA16 MoE backend '{backend.value}' does not support the "
            "deployment configuration."
        )

    def _return_or_raise(
        backend: WNA16MoEBackend,
        config: FusedMoEConfig,
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
        activation_format: mk.FusedMoEActivationFormat,
    ) -> tuple[WNA16MoEBackend, type[mk.FusedMoEExperts]]:
        reason: str | None = None
        for k_cls in backend_to_kernel_cls(backend):
            supported, reason = k_cls.is_supported_config(
                k_cls, config, weight_key, activation_key, activation_format
            )
            if supported:
                logger.info_once(_make_log_backend(backend), scope="local")
                return backend, k_cls
        raise ValueError(_make_log_unsupported(backend, reason))

    # Select kernels in order of backend.
    AVAILABLE_BACKENDS = _get_priority_backends()

    for backend in AVAILABLE_BACKENDS:
        activation_key = None  # always BF16 activation for WNA16 MoE
        for k_cls in backend_to_kernel_cls(backend):
            supported, reason = k_cls.is_supported_config(
                k_cls, config, weight_key, activation_key, activation_format
            )
            if supported:
                logger.info_once(_make_log_backend(backend), scope="local")
                return backend, k_cls
            else:
                logger.debug_once(_make_log_unsupported(backend, reason), scope="local")

    raise NotImplementedError(
        "No WNA16 MoE backend supports the deployment configuration."
    )
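
Putting the pieces together, the following hedged sketch shows how a quantization method might drive backend selection and weight conversion. The attribute names on layer and the way the FusedMoEConfig, QuantKey, and quant_config objects are obtained are assumptions for illustration, not vLLM API guarantees.

# Illustrative flow only: layer attribute names and config construction are
# placeholders, not part of the documented API.
from vllm.model_executor.layers.fused_moe.oracle.int_wna16 import (
    convert_to_wna16_moe_kernel_format,
    select_wna16_moe_backend,
)


def prepare_wna16_moe_weights(layer, moe_config, quant_config, weight_key):
    # Pick the highest-priority backend whose kernel supports this deployment.
    backend, experts_cls = select_wna16_moe_backend(
        config=moe_config,
        weight_key=weight_key,
        weight_bits=4,
    )
    # Convert the layer's packed weights into that backend's kernel format.
    converted = convert_to_wna16_moe_kernel_format(
        backend,
        layer,
        quant_config,
        input_dtype=None,  # 16-bit activations
        w13=layer.w13_qweight,
        w2=layer.w2_qweight,
        w13_scale=layer.w13_scales,
        w2_scale=layer.w2_scales,
        w13_g_idx=layer.w13_g_idx,
        w2_g_idx=layer.w2_g_idx,
    )
    return backend, experts_cls, converted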