vllm.model_executor.layers.fused_moe.prepare_finalize

MoEPrepareAndFinalizeNoEP

Bases: FusedMoEPrepareAndFinalize
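Prepare/finalize implementation for the single-rank case with no expert parallelism (no EP): prepare performs no cross-rank dispatch and only (optionally) quantizes the activations, while finalize reduces the per-expert outputs back to one row per token.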

Source code in vllm/model_executor/layers/fused_moe/prepare_finalize.py
class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):

    @property
    def activation_format(self) -> mk.FusedMoEActivationFormat:
        return mk.FusedMoEActivationFormat.Standard

    def max_num_tokens_per_rank(self) -> Optional[int]:
        return None

    def topk_indices_dtype(self) -> Optional[torch.dtype]:
        return None

    def num_dispatchers(self) -> int:
        return 1

    def prepare(
        self,
        a1: torch.Tensor,
        a1_scale: Optional[torch.Tensor],
        a2_scale: Optional[torch.Tensor],
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        num_experts: int,
        expert_map: Optional[torch.Tensor],
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
               Optional[torch.Tensor], Optional[torch.Tensor]]:

        if apply_router_weight_on_input:
            topk = topk_ids.size(1)
            # TODO: this only works for topK=1, will need to update for topK>1
            assert topk == 1, \
                "apply_router_weight_on_input is only implemented for topk=1"
            a1.mul_(topk_weights.to(a1.dtype))

        a1q, a1q_scale = moe_kernel_quantize_input(
            a1, a1_scale, quant_config.quant_dtype,
            quant_config.per_act_token_quant, quant_config.block_shape)

        return a1q, a1q_scale, None, None, None

    def finalize(
        self,
        output: torch.Tensor,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
    ) -> None:
        _moe_unpermute_and_reduce(output, fused_expert_output, None,
                                  topk_weights, apply_router_weight_on_input)

activation_format property

activation_format: FusedMoEActivationFormat

finalize

finalize(
    output: Tensor,
    fused_expert_output: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    apply_router_weight_on_input: bool,
) -> None
Source code in vllm/model_executor/layers/fused_moe/prepare_finalize.py
def finalize(
    self,
    output: torch.Tensor,
    fused_expert_output: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    apply_router_weight_on_input: bool,
) -> None:
    _moe_unpermute_and_reduce(output, fused_expert_output, None,
                              topk_weights, apply_router_weight_on_input)

max_num_tokens_per_rank

max_num_tokens_per_rank() -> Optional[int]
Source code in vllm/model_executor/layers/fused_moe/prepare_finalize.py
def max_num_tokens_per_rank(self) -> Optional[int]:
    return None

num_dispatchers

num_dispatchers() -> int
Source code in vllm/model_executor/layers/fused_moe/prepare_finalize.py
def num_dispatchers(self) -> int:
    return 1

prepare

prepare(
    a1: Tensor,
    a1_scale: Optional[Tensor],
    a2_scale: Optional[Tensor],
    topk_weights: Tensor,
    topk_ids: Tensor,
    num_experts: int,
    expert_map: Optional[Tensor],
    apply_router_weight_on_input: bool,
    quant_config: FusedMoEQuantConfig,
) -> tuple[
    Tensor,
    Optional[Tensor],
    Optional[Tensor],
    Optional[Tensor],
    Optional[Tensor],
]
Source code in vllm/model_executor/layers/fused_moe/prepare_finalize.py
def prepare(
    self,
    a1: torch.Tensor,
    a1_scale: Optional[torch.Tensor],
    a2_scale: Optional[torch.Tensor],
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    num_experts: int,
    expert_map: Optional[torch.Tensor],
    apply_router_weight_on_input: bool,
    quant_config: FusedMoEQuantConfig,
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
           Optional[torch.Tensor], Optional[torch.Tensor]]:

    if apply_router_weight_on_input:
        topk = topk_ids.size(1)
        # TODO: this only works for topK=1, will need to update for topK>1
        assert topk == 1, \
            "apply_router_weight_on_input is only implemented for topk=1"
        a1.mul_(topk_weights.to(a1.dtype))

    a1q, a1q_scale = moe_kernel_quantize_input(
        a1, a1_scale, quant_config.quant_dtype,
        quant_config.per_act_token_quant, quant_config.block_shape)

    return a1q, a1q_scale, None, None, None
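The apply_router_weight_on_input branch folds the single router weight into the activations before quantization, which is why finalize can skip the weighting later. A plain-torch sketch of just that pre-scaling step, with assumed shapes:

import torch

M, K = 4, 8
a1 = torch.randn(M, K)
topk_weights = torch.rand(M, 1)  # this path requires topk == 1

# Mirrors the in-place a1.mul_(...) step in prepare(): each token row
# is scaled by its (only) router weight before quantization.
a1.mul_(topk_weights.to(a1.dtype))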

topk_indices_dtype

topk_indices_dtype() -> Optional[dtype]
Source code in vllm/model_executor/layers/fused_moe/prepare_finalize.py
def topk_indices_dtype(self) -> Optional[torch.dtype]:
    return None