vllm.model_executor.layers.fused_moe.moe_fused_mul_sum

moe_fused_mul_sum

moe_fused_mul_sum(
    inputs: Tensor,
    topk_weights: Tensor,
    outputs: Tensor | None = None,
    topk_ids: Tensor | None = None,
    expert_map: Tensor | None = None,
) -> Tensor

Fused kernel for MoE (Mixture of Experts) to perform weighted summation of expert outputs.
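Concretely, for each token `t` the kernel computes `outputs[t] = sum_k topk_weights[t, k] * inputs[t, k, :]`, skipping any `(t, k)` pair that `expert_map` marks as invalid.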

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `inputs` | `Tensor` | The output from the experts. Shape: `(num_tokens, top_k, hidden_size)`. | required |
| `topk_weights` | `Tensor` | The weights assigned to each expert for each token. Shape: `(num_tokens, top_k)`. | required |
| `outputs` | `Tensor \| None` | Optional pre-allocated output tensor. Shape: `(num_tokens, hidden_size)`. | `None` |
| `topk_ids` | `Tensor \| None` | Optional indices of the top-k experts; used when `expert_map` is provided. Shape: `(num_tokens, top_k)`. | `None` |
| `expert_map` | `Tensor \| None` | Optional expert mapping for Expert Parallelism. A value < 0 marks an invalid token/expert pair that is skipped. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `Tensor` | The fused weighted sum of expert outputs. Shape: `(num_tokens, hidden_size)`. |
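A minimal usage sketch (shapes and dtypes are illustrative; a CUDA device is assumed, since the kernel is launched through Triton):

import torch

from vllm.model_executor.layers.fused_moe.moe_fused_mul_sum import moe_fused_mul_sum

num_tokens, top_k, hidden_size = 128, 4, 2048
inputs = torch.randn(num_tokens, top_k, hidden_size, dtype=torch.bfloat16, device="cuda")
topk_weights = torch.rand(num_tokens, top_k, dtype=torch.float32, device="cuda")

out = moe_fused_mul_sum(inputs, topk_weights)
assert out.shape == (num_tokens, hidden_size)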

Source code in vllm/model_executor/layers/fused_moe/moe_fused_mul_sum.py
import torch
import triton
from torch._subclasses.fake_tensor import FakeTensor

# `_heuristic_config` and `moe_fused_mul_sum_kernel` are defined alongside
# this function in the same module.


def moe_fused_mul_sum(
    inputs: torch.Tensor,
    topk_weights: torch.Tensor,
    outputs: torch.Tensor | None = None,
    topk_ids: torch.Tensor | None = None,
    expert_map: torch.Tensor | None = None,
) -> torch.Tensor:
    """
    Fused kernel for MoE (Mixture of Experts) to perform weighted summation
    of expert outputs.

    Args:
        inputs: The output from experts.
            Shape: (num_tokens, top_k, hidden_size).
        topk_weights: The weights assigned to each expert for each token.
            Shape: (num_tokens, top_k).
        outputs: Optional pre-allocated output tensor.
            Shape: (num_tokens, hidden_size).
        topk_ids: Optional indices of the top-k experts. Used when
            `expert_map` is provided. Shape: (num_tokens, top_k).
        expert_map: Optional mapping for Expert Parallelism. A value < 0
            indicates an invalid token/expert pair that will be skipped.

    Returns:
        The fused weighted sum of expert outputs.
        Shape: (num_tokens, hidden_size).
    """
    # The Triton kernel requires contiguous tensors in a supported
    # floating-point dtype.
    assert inputs.ndim == 3
    assert topk_weights.ndim == 2
    assert inputs.is_contiguous()
    assert topk_weights.is_contiguous()
    assert inputs.dtype in (torch.float32, torch.float16, torch.bfloat16)
    assert topk_weights.dtype in (torch.float32, torch.float16, torch.bfloat16)

    num_tokens, top_k, size = inputs.shape
    output_shape = (num_tokens, size)
    if outputs is None:
        # Allocate the output lazily when the caller does not pass one in.
        outputs = torch.empty(output_shape, dtype=inputs.dtype, device=inputs.device)

    assert outputs.shape == output_shape
    assert topk_weights.shape == (num_tokens, top_k)

    # FakeTensors (e.g. while tracing under torch.compile) carry no real data,
    # so the kernel launch is skipped and the output is returned as-is.
    if not isinstance(inputs, FakeTensor):
        BLOCK_M, BLOCK_K, num_warps, num_stages = _heuristic_config(
            num_tokens,
            top_k,
            size,
            inputs.element_size(),
        )
        # 2D launch grid: one program per (BLOCK_K columns, BLOCK_M tokens) tile.
        grid = (triton.cdiv(size, BLOCK_K), triton.cdiv(num_tokens, BLOCK_M))
        moe_fused_mul_sum_kernel[grid](
            inputs,
            topk_weights,
            outputs,
            topk_ids,
            expert_map,
            num_tokens,
            top_k * size,
            expert_map is not None,
            top_k,
            size,
            BLOCK_M,
            BLOCK_K,
            num_warps=num_warps,
            num_stages=num_stages,
        )

    return outputs
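For reference, the result matches the following unfused PyTorch computation. This is a sketch, not the kernel's actual implementation: the float32 accumulation here is a choice made for clarity, and the `expert_map` handling mirrors the documented "skip invalid pairs" behavior.

import torch

def reference_moe_mul_sum(inputs, topk_weights, topk_ids=None, expert_map=None):
    weights = topk_weights.to(torch.float32)
    if expert_map is not None:
        # Drop (token, expert) pairs whose mapped expert id is negative,
        # mirroring the pairs the fused kernel skips.
        valid = expert_map[topk_ids.long()] >= 0
        weights = weights * valid
    # Weighted sum over the top_k axis:
    # out[t] = sum_k weights[t, k] * inputs[t, k, :]
    out = (inputs.to(torch.float32) * weights.unsqueeze(-1)).sum(dim=1)
    return out.to(inputs.dtype)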