vllm.model_executor.models.hunyuan_v1_moe

Inference-only HunYuan model compatible with HuggingFace weights.
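
The classes documented below are registered as a vLLM model implementation, so the usual offline entry point is enough to exercise them. A minimal sketch, assuming the Hugging Face repository name "tencent/Hunyuan-A13B-Instruct" (an assumption for illustration; substitute the HunYuan MoE checkpoint you actually use):

from vllm import LLM, SamplingParams

# Checkpoint name is an assumption for illustration; any HunYuan MoE
# checkpoint supported by this module can be used instead.
llm = LLM(model="tencent/Hunyuan-A13B-Instruct", trust_remote_code=True)
outputs = llm.generate(["Hello, HunYuan!"],
                       SamplingParams(temperature=0.7, max_tokens=64))
print(outputs[0].outputs[0].text)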

HunYuanAttention

Bases: Module

Source code in vllm/model_executor/models/hunyuan_v1_moe.py
class HunYuanAttention(nn.Module):

    def __init__(
        self,
        config: PretrainedConfig,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        rope_theta: float = 10000,
        rope_scaling: Optional[dict[str, Any]] = None,
        max_position_embeddings: int = 8192,
        quant_config: Optional[QuantizationConfig] = None,
        bias: bool = False,
        cache_config: Optional[CacheConfig] = None,
        prefix: str = "",
        layer_id: int = -1,
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = num_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.total_num_kv_heads = num_kv_heads
        if self.total_num_kv_heads >= tp_size:
            # Number of KV heads is greater than or equal to TP size, so we
            # partition the KV heads across multiple tensor parallel GPUs.
            assert self.total_num_kv_heads % tp_size == 0
        else:
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
        if hasattr(config, "head_dim"):
            self.head_dim = config.head_dim
        elif hasattr(config, "attention_head_dim"):
            self.head_dim = config.attention_head_dim
        else:
            self.head_dim = self.hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings
        self.use_qk_norm = getattr(config, "use_qk_norm", False)
        self.layer_id = layer_id

        self.qkv_proj = QKVParallelLinear(
            hidden_size=hidden_size,
            head_size=self.head_dim,
            total_num_heads=self.total_num_heads,
            total_num_kv_heads=self.total_num_kv_heads,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.qkv_proj",
        )

        self.o_proj = RowParallelLinear(
            input_size=self.total_num_heads * self.head_dim,
            output_size=hidden_size,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.o_proj",
        )

        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=max_position_embeddings,
            base=rope_theta,
            rope_scaling=rope_scaling,
            is_neox_style=True,
        )
        self.attn = Attention(
            self.num_heads,
            self.head_dim,
            self.scaling,
            num_kv_heads=self.num_kv_heads,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
        )

        if self.use_qk_norm:
            self.query_layernorm = RMSNorm(self.head_dim,
                                           eps=config.rms_norm_eps)
            self.key_layernorm = RMSNorm(self.head_dim,
                                         eps=config.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_states: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(positions, q, k)
        ori_k = k
        if self.use_qk_norm:
            q = self.query_layernorm(
                q.view(-1, self.num_heads, self.head_dim).contiguous())
            k = self.key_layernorm(
                k.view(-1, self.num_kv_heads, self.head_dim).contiguous())

        attn_output = self.attn(q, k, v)
        # For o_proj
        attn_output = attn_output.view(q.shape[0], -1)
        output, _ = self.o_proj(attn_output)
        return output, (ori_k, v)

attn instance-attribute

attn = Attention(
    num_heads,
    head_dim,
    scaling,
    num_kv_heads=num_kv_heads,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.attn",
)

head_dim instance-attribute

head_dim = head_dim

hidden_size instance-attribute

hidden_size = hidden_size

key_layernorm instance-attribute

key_layernorm = RMSNorm(head_dim, eps=rms_norm_eps)

kv_size instance-attribute

kv_size = num_kv_heads * head_dim

layer_id instance-attribute

layer_id = layer_id

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

num_heads instance-attribute

num_heads = total_num_heads // tp_size

num_kv_heads instance-attribute

num_kv_heads = max(1, total_num_kv_heads // tp_size)

o_proj instance-attribute

o_proj = RowParallelLinear(
    input_size=total_num_heads * head_dim,
    output_size=hidden_size,
    bias=bias,
    quant_config=quant_config,
    prefix=f"{prefix}.o_proj",
)

q_size instance-attribute

q_size = num_heads * head_dim

qkv_proj instance-attribute

qkv_proj = QKVParallelLinear(
    hidden_size=hidden_size,
    head_size=head_dim,
    total_num_heads=total_num_heads,
    total_num_kv_heads=total_num_kv_heads,
    bias=bias,
    quant_config=quant_config,
    prefix=f"{prefix}.qkv_proj",
)

query_layernorm instance-attribute

query_layernorm = RMSNorm(head_dim, eps=rms_norm_eps)

rope_theta instance-attribute

rope_theta = rope_theta

rotary_emb instance-attribute

rotary_emb = get_rope(
    head_dim,
    rotary_dim=head_dim,
    max_position=max_position_embeddings,
    base=rope_theta,
    rope_scaling=rope_scaling,
    is_neox_style=True,
)

scaling instance-attribute

scaling = head_dim ** -0.5

total_num_heads instance-attribute

total_num_heads = num_heads

total_num_kv_heads instance-attribute

total_num_kv_heads = num_kv_heads

use_qk_norm instance-attribute

use_qk_norm = getattr(config, 'use_qk_norm', False)

__init__

__init__(
    config: PretrainedConfig,
    hidden_size: int,
    num_heads: int,
    num_kv_heads: int,
    rope_theta: float = 10000,
    rope_scaling: Optional[dict[str, Any]] = None,
    max_position_embeddings: int = 8192,
    quant_config: Optional[QuantizationConfig] = None,
    bias: bool = False,
    cache_config: Optional[CacheConfig] = None,
    prefix: str = "",
    layer_id: int = -1,
) -> None
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def __init__(
    self,
    config: PretrainedConfig,
    hidden_size: int,
    num_heads: int,
    num_kv_heads: int,
    rope_theta: float = 10000,
    rope_scaling: Optional[dict[str, Any]] = None,
    max_position_embeddings: int = 8192,
    quant_config: Optional[QuantizationConfig] = None,
    bias: bool = False,
    cache_config: Optional[CacheConfig] = None,
    prefix: str = "",
    layer_id: int = -1,
) -> None:
    super().__init__()
    self.hidden_size = hidden_size
    tp_size = get_tensor_model_parallel_world_size()
    self.total_num_heads = num_heads
    assert self.total_num_heads % tp_size == 0
    self.num_heads = self.total_num_heads // tp_size
    self.total_num_kv_heads = num_kv_heads
    if self.total_num_kv_heads >= tp_size:
        # Number of KV heads is greater than or equal to TP size, so we
        # partition the KV heads across multiple tensor parallel GPUs.
        assert self.total_num_kv_heads % tp_size == 0
    else:
        # Number of KV heads is less than TP size, so we replicate
        # the KV heads across multiple tensor parallel GPUs.
        assert tp_size % self.total_num_kv_heads == 0
    self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
    # MistralConfig has an optional head_dim introduced by Mistral-Nemo
    if hasattr(config, "head_dim"):
        self.head_dim = config.head_dim
    elif hasattr(config, "attention_head_dim"):
        self.head_dim = config.attention_head_dim
    else:
        self.head_dim = self.hidden_size // self.total_num_heads
    self.q_size = self.num_heads * self.head_dim
    self.kv_size = self.num_kv_heads * self.head_dim
    self.scaling = self.head_dim**-0.5
    self.rope_theta = rope_theta
    self.max_position_embeddings = max_position_embeddings
    self.use_qk_norm = getattr(config, "use_qk_norm", False)
    self.layer_id = layer_id

    self.qkv_proj = QKVParallelLinear(
        hidden_size=hidden_size,
        head_size=self.head_dim,
        total_num_heads=self.total_num_heads,
        total_num_kv_heads=self.total_num_kv_heads,
        bias=bias,
        quant_config=quant_config,
        prefix=f"{prefix}.qkv_proj",
    )

    self.o_proj = RowParallelLinear(
        input_size=self.total_num_heads * self.head_dim,
        output_size=hidden_size,
        bias=bias,
        quant_config=quant_config,
        prefix=f"{prefix}.o_proj",
    )

    self.rotary_emb = get_rope(
        self.head_dim,
        rotary_dim=self.head_dim,
        max_position=max_position_embeddings,
        base=rope_theta,
        rope_scaling=rope_scaling,
        is_neox_style=True,
    )
    self.attn = Attention(
        self.num_heads,
        self.head_dim,
        self.scaling,
        num_kv_heads=self.num_kv_heads,
        cache_config=cache_config,
        quant_config=quant_config,
        prefix=f"{prefix}.attn",
    )

    if self.use_qk_norm:
        self.query_layernorm = RMSNorm(self.head_dim,
                                       eps=config.rms_norm_eps)
        self.key_layernorm = RMSNorm(self.head_dim,
                                     eps=config.rms_norm_eps)
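
The tensor-parallel bookkeeping above either partitions or replicates the KV heads depending on how they compare to the TP world size. A minimal sketch of that arithmetic, using illustrative head counts that are not taken from any real HunYuan config:

def kv_heads_per_rank(total_num_kv_heads: int, tp_size: int) -> int:
    # Mirrors the constructor: partition KV heads when there are at least
    # as many heads as TP ranks, otherwise replicate one head per rank.
    if total_num_kv_heads >= tp_size:
        assert total_num_kv_heads % tp_size == 0
    else:
        assert tp_size % total_num_kv_heads == 0
    return max(1, total_num_kv_heads // tp_size)

print(kv_heads_per_rank(8, 4))  # 2 -> KV heads are partitioned
print(kv_heads_per_rank(2, 8))  # 1 -> KV heads are replicated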

forward

forward(
    positions: Tensor,
    hidden_states: Tensor,
    kv_states: Optional[tuple[Tensor, Tensor]] = None,
) -> tuple[Tensor, tuple[Tensor, Tensor]]
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def forward(
    self,
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
    kv_states: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
    qkv, _ = self.qkv_proj(hidden_states)
    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
    q, k = self.rotary_emb(positions, q, k)
    ori_k = k
    if self.use_qk_norm:
        q = self.query_layernorm(
            q.view(-1, self.num_heads, self.head_dim).contiguous())
        k = self.key_layernorm(
            k.view(-1, self.num_kv_heads, self.head_dim).contiguous())

    attn_output = self.attn(q, k, v)
    # For o_proj
    attn_output = attn_output.view(q.shape[0], -1)
    output, _ = self.o_proj(attn_output)
    return output, (ori_k, v)
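
The fused QKV projection emits q_size + 2 * kv_size features per token on each tensor-parallel rank, and forward splits them apart again before applying rotary embeddings. A small sketch with illustrative per-rank sizes (assumed, not from a real config):

import torch

num_heads, num_kv_heads, head_dim = 8, 2, 128   # per-rank values, assumed
q_size = num_heads * head_dim                   # 1024
kv_size = num_kv_heads * head_dim               # 256

qkv = torch.randn(4, q_size + 2 * kv_size)      # 4 tokens
q, k, v = qkv.split([q_size, kv_size, kv_size], dim=-1)
print(q.shape, k.shape, v.shape)
# torch.Size([4, 1024]) torch.Size([4, 256]) torch.Size([4, 256])

Note that forward also hands back the pre-normalization key and the value as (ori_k, v); the cross-layer attention documented next consumes exactly that pair.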

HunYuanCrossAttention

Bases: Module

Source code in vllm/model_executor/models/hunyuan_v1_moe.py
class HunYuanCrossAttention(nn.Module):

    def __init__(
        self,
        config: PretrainedConfig,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        rope_theta: float = 10000,
        rope_scaling: Optional[dict[str, Any]] = None,
        max_position_embeddings: int = 8192,
        quant_config: Optional[QuantizationConfig] = None,
        bias: bool = False,
        cache_config: Optional[CacheConfig] = None,
        prefix: str = "",
        layer_id: int = -1,
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = num_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.total_num_kv_heads = num_kv_heads
        if self.total_num_kv_heads >= tp_size:
            # Number of KV heads is greater than or equal to TP size, so we
            # partition the KV heads across multiple tensor parallel GPUs.
            assert self.total_num_kv_heads % tp_size == 0
        else:
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
        if hasattr(config, "head_dim"):
            self.head_dim = config.head_dim
        elif hasattr(config, "attention_head_dim"):
            self.head_dim = config.attention_head_dim
        else:
            self.head_dim = self.hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings
        self.use_qk_norm = getattr(config, "use_qk_norm", False)
        self.layer_id = layer_id

        self.q_proj = ColumnParallelLinear(
            hidden_size,
            hidden_size,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.q_proj",
        )

        self.o_proj = RowParallelLinear(
            input_size=self.total_num_heads * self.head_dim,
            output_size=hidden_size,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.o_proj",
        )

        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=max_position_embeddings,
            base=rope_theta,
            rope_scaling=rope_scaling,
            is_neox_style=True,
        )
        self.attn = Attention(
            self.num_heads,
            self.head_dim,
            self.scaling,
            num_kv_heads=self.num_kv_heads,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
            attn_type=AttentionType.ENCODER_DECODER,
        )

        if self.use_qk_norm:
            self.query_layernorm = RMSNorm(self.head_dim,
                                           eps=config.rms_norm_eps)
            self.key_layernorm = RMSNorm(self.head_dim,
                                         eps=config.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_states: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        assert kv_states is not None
        ori_k, v = kv_states  # reuse K/V from the preceding full-attention layer
        k = ori_k
        q, _ = self.q_proj(hidden_states)
        k_tmp = torch.empty_like(k)  # TODO: redundant rotary embedding
        q, _ = self.rotary_emb(positions, q, k_tmp)
        if self.use_qk_norm:
            q = self.query_layernorm(
                q.view(-1, self.num_heads, self.head_dim).contiguous())
            k = self.key_layernorm(
                k.view(-1, self.num_kv_heads, self.head_dim).contiguous())

        attn_output = self.attn(q, k, v)
        # For o_proj
        attn_output = attn_output.view(q.shape[0], -1)
        output, _ = self.o_proj(attn_output)
        return output, (ori_k, v)

attn instance-attribute

attn = Attention(
    num_heads,
    head_dim,
    scaling,
    num_kv_heads=num_kv_heads,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.attn",
    attn_type=ENCODER_DECODER,
)

head_dim instance-attribute

head_dim = head_dim

hidden_size instance-attribute

hidden_size = hidden_size

key_layernorm instance-attribute

key_layernorm = RMSNorm(head_dim, eps=rms_norm_eps)

kv_size instance-attribute

kv_size = num_kv_heads * head_dim

layer_id instance-attribute

layer_id = layer_id

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

num_heads instance-attribute

num_heads = total_num_heads // tp_size

num_kv_heads instance-attribute

num_kv_heads = max(1, total_num_kv_heads // tp_size)

o_proj instance-attribute

o_proj = RowParallelLinear(
    input_size=total_num_heads * head_dim,
    output_size=hidden_size,
    bias=bias,
    quant_config=quant_config,
    prefix=f"{prefix}.o_proj",
)

q_proj instance-attribute

q_proj = ColumnParallelLinear(
    hidden_size,
    hidden_size,
    bias=bias,
    quant_config=quant_config,
    prefix=f"{prefix}.q_proj",
)

q_size instance-attribute

q_size = num_heads * head_dim

query_layernorm instance-attribute

query_layernorm = RMSNorm(head_dim, eps=rms_norm_eps)

rope_theta instance-attribute

rope_theta = rope_theta

rotary_emb instance-attribute

rotary_emb = get_rope(
    head_dim,
    rotary_dim=head_dim,
    max_position=max_position_embeddings,
    base=rope_theta,
    rope_scaling=rope_scaling,
    is_neox_style=True,
)

scaling instance-attribute

scaling = head_dim ** -0.5

total_num_heads instance-attribute

total_num_heads = num_heads

total_num_kv_heads instance-attribute

total_num_kv_heads = num_kv_heads

use_qk_norm instance-attribute

use_qk_norm = getattr(config, 'use_qk_norm', False)

__init__

__init__(
    config: PretrainedConfig,
    hidden_size: int,
    num_heads: int,
    num_kv_heads: int,
    rope_theta: float = 10000,
    rope_scaling: Optional[dict[str, Any]] = None,
    max_position_embeddings: int = 8192,
    quant_config: Optional[QuantizationConfig] = None,
    bias: bool = False,
    cache_config: Optional[CacheConfig] = None,
    prefix: str = "",
    layer_id: int = -1,
) -> None
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def __init__(
    self,
    config: PretrainedConfig,
    hidden_size: int,
    num_heads: int,
    num_kv_heads: int,
    rope_theta: float = 10000,
    rope_scaling: Optional[dict[str, Any]] = None,
    max_position_embeddings: int = 8192,
    quant_config: Optional[QuantizationConfig] = None,
    bias: bool = False,
    cache_config: Optional[CacheConfig] = None,
    prefix: str = "",
    layer_id: int = -1,
) -> None:
    super().__init__()
    self.hidden_size = hidden_size
    tp_size = get_tensor_model_parallel_world_size()
    self.total_num_heads = num_heads
    assert self.total_num_heads % tp_size == 0
    self.num_heads = self.total_num_heads // tp_size
    self.total_num_kv_heads = num_kv_heads
    if self.total_num_kv_heads >= tp_size:
        # Number of KV heads is greater than or equal to TP size, so we
        # partition the KV heads across multiple tensor parallel GPUs.
        assert self.total_num_kv_heads % tp_size == 0
    else:
        # Number of KV heads is less than TP size, so we replicate
        # the KV heads across multiple tensor parallel GPUs.
        assert tp_size % self.total_num_kv_heads == 0
    self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
    # MistralConfig has an optional head_dim introduced by Mistral-Nemo
    if hasattr(config, "head_dim"):
        self.head_dim = config.head_dim
    elif hasattr(config, "attention_head_dim"):
        self.head_dim = config.attention_head_dim
    else:
        self.head_dim = self.hidden_size // self.total_num_heads
    self.q_size = self.num_heads * self.head_dim
    self.kv_size = self.num_kv_heads * self.head_dim
    self.scaling = self.head_dim**-0.5
    self.rope_theta = rope_theta
    self.max_position_embeddings = max_position_embeddings
    self.use_qk_norm = getattr(config, "use_qk_norm", False)
    self.layer_id = layer_id

    self.q_proj = ColumnParallelLinear(
        hidden_size,
        hidden_size,
        bias=bias,
        quant_config=quant_config,
        prefix=f"{prefix}.q_proj",
    )

    self.o_proj = RowParallelLinear(
        input_size=self.total_num_heads * self.head_dim,
        output_size=hidden_size,
        bias=bias,
        quant_config=quant_config,
        prefix=f"{prefix}.o_proj",
    )

    self.rotary_emb = get_rope(
        self.head_dim,
        rotary_dim=self.head_dim,
        max_position=max_position_embeddings,
        base=rope_theta,
        rope_scaling=rope_scaling,
        is_neox_style=True,
    )
    self.attn = Attention(
        self.num_heads,
        self.head_dim,
        self.scaling,
        num_kv_heads=self.num_kv_heads,
        cache_config=cache_config,
        quant_config=quant_config,
        prefix=f"{prefix}.attn",
        attn_type=AttentionType.ENCODER_DECODER,
    )

    if self.use_qk_norm:
        self.query_layernorm = RMSNorm(self.head_dim,
                                       eps=config.rms_norm_eps)
        self.key_layernorm = RMSNorm(self.head_dim,
                                     eps=config.rms_norm_eps)

forward

forward(
    positions: Tensor,
    hidden_states: Tensor,
    kv_states: Optional[tuple[Tensor, Tensor]] = None,
) -> tuple[Tensor, tuple[Tensor, Tensor]]
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def forward(
    self,
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
    kv_states: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
    assert kv_states is not None
    ori_k, v = kv_states  # reuse K/V from the preceding full-attention layer
    k = ori_k
    q, _ = self.q_proj(hidden_states)
    k_tmp = torch.empty_like(k)  # TODO: redundant rotary embedding
    q, _ = self.rotary_emb(positions, q, k_tmp)
    if self.use_qk_norm:
        q = self.query_layernorm(
            q.view(-1, self.num_heads, self.head_dim).contiguous())
        k = self.key_layernorm(
            k.view(-1, self.num_kv_heads, self.head_dim).contiguous())

    attn_output = self.attn(q, k, v)
    # For o_proj
    attn_output = attn_output.view(q.shape[0], -1)
    output, _ = self.o_proj(attn_output)
    return output, (ori_k, v)
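
Unlike HunYuanAttention, this layer projects only the queries and reuses the (k, v) pair returned by an earlier full-attention layer. A schematic sketch of that wiring, assuming a cross-layer-attention factor of 2; full_attn and cross_attn stand in for the two forward methods and are placeholders, not real constructions:

def run_cla_pair(full_attn, cross_attn, positions, hidden_states):
    # The full-attention layer computes fresh K/V and returns the pair
    # alongside its output.
    hidden_states, kv_states = full_attn(positions, hidden_states)
    # The cross-attention layer projects only Q and attends against the
    # cached (k, v) pair instead of recomputing it.
    hidden_states, _ = cross_attn(positions, hidden_states,
                                  kv_states=kv_states)
    return hidden_states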

HunYuanDecoderLayer

Bases: Module

Source code in vllm/model_executor/models/hunyuan_v1_moe.py
class HunYuanDecoderLayer(nn.Module):

    def __init__(
        self,
        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
        layer_id: int = -1,
    ) -> None:
        super().__init__()
        assert layer_id >= 0
        self.layer_id = layer_id
        self.hidden_size = config.hidden_size
        self.intermediate_size = (config.intermediate_size if isinstance(
            config.intermediate_size, int) else
                                  config.intermediate_size[layer_id])
        rope_theta = getattr(config, "rope_theta", 10000)
        rope_scaling = getattr(config, "rope_scaling", None)
        if rope_scaling is not None and getattr(
                config, "original_max_position_embeddings", None):
            rope_scaling["original_max_position_embeddings"] = (
                config.original_max_position_embeddings)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
        attention_bias = getattr(config, "attention_bias", False) or getattr(
            config, "bias", False)
        cla_factor = _get_cla_factor(config)
        attention_type = (AttentionType.ENCODER_DECODER
                          if layer_id >= 0 and layer_id % cla_factor != 0 else
                          AttentionType.DECODER)
        if attention_type == AttentionType.DECODER:
            self.self_attn = HunYuanAttention(
                config=config,
                hidden_size=self.hidden_size,
                num_heads=config.num_attention_heads,
                num_kv_heads=getattr(config, "num_key_value_heads",
                                     config.num_attention_heads),
                rope_theta=rope_theta,
                rope_scaling=rope_scaling,
                max_position_embeddings=max_position_embeddings,
                quant_config=quant_config,
                bias=attention_bias,
                cache_config=cache_config,
                prefix=f"{prefix}.self_attn",
                layer_id=layer_id,
            )
        elif attention_type == AttentionType.ENCODER_DECODER:
            self.self_attn = HunYuanCrossAttention(
                config=config,
                hidden_size=self.hidden_size,
                num_heads=config.num_attention_heads,
                num_kv_heads=getattr(config, "num_key_value_heads",
                                     config.num_attention_heads),
                rope_theta=rope_theta,
                rope_scaling=rope_scaling,
                max_position_embeddings=max_position_embeddings,
                quant_config=quant_config,
                bias=attention_bias,
                cache_config=cache_config,
                prefix=f"{prefix}.self_attn",
                layer_id=layer_id,
            )
        else:
            raise RuntimeError(f"Unsupported attention type: {attention_type}")

        self.mlp = HunYuanSparseMoeBlock(
            config=config,
            quant_config=quant_config,
            layer_id=layer_id,
            prefix=f"{prefix}.mlp",
        )
        self.input_layernorm = RMSNorm(config.hidden_size,
                                       eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                eps=config.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
        kv_states: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> tuple[torch.Tensor, torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        # Self Attention
        if residual is None:
            residual = hidden_states
            hidden_states = self.input_layernorm(hidden_states)
        else:
            hidden_states, residual = self.input_layernorm(
                hidden_states, residual)
        hidden_states, ori_kv_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_states=kv_states,
        )

        # Fully Connected
        hidden_states, residual = self.post_attention_layernorm(
            hidden_states, residual)
        hidden_states = self.mlp(hidden_states)
        return hidden_states, residual, ori_kv_states

hidden_size instance-attribute

hidden_size = hidden_size

input_layernorm instance-attribute

input_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps)

intermediate_size instance-attribute

intermediate_size = (
    intermediate_size
    if isinstance(intermediate_size, int)
    else intermediate_size[layer_id]
)

layer_id instance-attribute

layer_id = layer_id

mlp instance-attribute

mlp = HunYuanSparseMoeBlock(
    config=config,
    quant_config=quant_config,
    layer_id=layer_id,
    prefix=f"{prefix}.mlp",
)

post_attention_layernorm instance-attribute

post_attention_layernorm = RMSNorm(
    hidden_size, eps=rms_norm_eps
)

self_attn instance-attribute

self_attn = HunYuanAttention(
    config=config,
    hidden_size=hidden_size,
    num_heads=num_attention_heads,
    num_kv_heads=getattr(
        config, "num_key_value_heads", num_attention_heads
    ),
    rope_theta=rope_theta,
    rope_scaling=rope_scaling,
    max_position_embeddings=max_position_embeddings,
    quant_config=quant_config,
    bias=attention_bias,
    cache_config=cache_config,
    prefix=f"{prefix}.self_attn",
    layer_id=layer_id,
)

__init__

__init__(
    config: PretrainedConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
    layer_id: int = -1,
) -> None
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def __init__(
    self,
    config: PretrainedConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
    layer_id: int = -1,
) -> None:
    super().__init__()
    assert layer_id >= 0
    self.layer_id = layer_id
    self.hidden_size = config.hidden_size
    self.intermediate_size = (config.intermediate_size if isinstance(
        config.intermediate_size, int) else
                              config.intermediate_size[layer_id])
    rope_theta = getattr(config, "rope_theta", 10000)
    rope_scaling = getattr(config, "rope_scaling", None)
    if rope_scaling is not None and getattr(
            config, "original_max_position_embeddings", None):
        rope_scaling["original_max_position_embeddings"] = (
            config.original_max_position_embeddings)
    max_position_embeddings = getattr(config, "max_position_embeddings",
                                      8192)
    attention_bias = getattr(config, "attention_bias", False) or getattr(
        config, "bias", False)
    cla_factor = _get_cla_factor(config)
    attention_type = (AttentionType.ENCODER_DECODER
                      if layer_id >= 0 and layer_id % cla_factor != 0 else
                      AttentionType.DECODER)
    if attention_type == AttentionType.DECODER:
        self.self_attn = HunYuanAttention(
            config=config,
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            num_kv_heads=getattr(config, "num_key_value_heads",
                                 config.num_attention_heads),
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            max_position_embeddings=max_position_embeddings,
            quant_config=quant_config,
            bias=attention_bias,
            cache_config=cache_config,
            prefix=f"{prefix}.self_attn",
            layer_id=layer_id,
        )
    elif attention_type == AttentionType.ENCODER_DECODER:
        self.self_attn = HunYuanCrossAttention(
            config=config,
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            num_kv_heads=getattr(config, "num_key_value_heads",
                                 config.num_attention_heads),
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            max_position_embeddings=max_position_embeddings,
            quant_config=quant_config,
            bias=attention_bias,
            cache_config=cache_config,
            prefix=f"{prefix}.self_attn",
            layer_id=layer_id,
        )
    else:
        raise RuntimeError(f"Unsupported attention type: {attention_type}")

    self.mlp = HunYuanSparseMoeBlock(
        config=config,
        quant_config=quant_config,
        layer_id=layer_id,
        prefix=f"{prefix}.mlp",
    )
    self.input_layernorm = RMSNorm(config.hidden_size,
                                   eps=config.rms_norm_eps)
    self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                            eps=config.rms_norm_eps)
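
Which of the two attention modules a layer receives is decided purely by layer_id and the cross-layer-attention factor. A sketch of that selection, assuming cla_factor = 2 for illustration (the real value comes from _get_cla_factor(config), which is not shown in this section):

cla_factor = 2          # assumed for illustration
num_hidden_layers = 8
for layer_id in range(num_hidden_layers):
    kind = ("HunYuanCrossAttention (ENCODER_DECODER)"
            if layer_id % cla_factor != 0
            else "HunYuanAttention (DECODER)")
    print(layer_id, kind)
# Layers 0, 2, 4, 6 build fresh K/V; layers 1, 3, 5, 7 reuse the K/V
# returned by the preceding full-attention layer.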

forward

forward(
    positions: Tensor,
    hidden_states: Tensor,
    residual: Optional[Tensor],
    kv_states: Optional[tuple[Tensor, Tensor]] = None,
) -> tuple[Tensor, Tensor, tuple[Tensor, Tensor]]
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def forward(
    self,
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
    residual: Optional[torch.Tensor],
    kv_states: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
) -> tuple[torch.Tensor, torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
    # Self Attention
    if residual is None:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
    else:
        hidden_states, residual = self.input_layernorm(
            hidden_states, residual)
    hidden_states, ori_kv_states = self.self_attn(
        positions=positions,
        hidden_states=hidden_states,
        kv_states=kv_states,
    )

    # Fully Connected
    hidden_states, residual = self.post_attention_layernorm(
        hidden_states, residual)
    hidden_states = self.mlp(hidden_states)
    return hidden_states, residual, ori_kv_states
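
The layer returns a triple: the MLP output, the running residual, and the (k, v) pair its attention produced or passed through. The model-level loop is not shown in this section, so the sketch below only illustrates the calling convention implied by this signature, not the exact implementation:

def run_layers(layers, positions, hidden_states):
    residual = None      # first layer folds hidden_states into the residual
    kv_states = None     # no cached K/V before the first full-attention layer
    for layer in layers:
        hidden_states, residual, kv_states = layer(
            positions, hidden_states, residual, kv_states)
    return hidden_states, residual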

HunYuanMLP

Bases: Module

Source code in vllm/model_executor/models/hunyuan_v1_moe.py
class HunYuanMLP(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
        quant_config: Optional[QuantizationConfig] = None,
        bias: bool = False,
        prefix: str = "",
        reduce_results: bool = True,
    ) -> None:
        super().__init__()
        self.gate_up_proj = MergedColumnParallelLinear(
            input_size=hidden_size,
            output_sizes=[intermediate_size] * 2,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.gate_up_proj",
        )
        self.down_proj = RowParallelLinear(
            input_size=intermediate_size,
            output_size=hidden_size,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.down_proj",
            reduce_results=reduce_results,
        )
        if hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {hidden_act}. "
                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()

    def forward(self, x):
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x

act_fn instance-attribute

act_fn = SiluAndMul()

down_proj instance-attribute

down_proj = RowParallelLinear(
    input_size=intermediate_size,
    output_size=hidden_size,
    bias=bias,
    quant_config=quant_config,
    prefix=f"{prefix}.down_proj",
    reduce_results=reduce_results,
)

gate_up_proj instance-attribute

gate_up_proj = MergedColumnParallelLinear(
    input_size=hidden_size,
    output_sizes=[intermediate_size] * 2,
    bias=bias,
    quant_config=quant_config,
    prefix=f"{prefix}.gate_up_proj",
)

__init__

__init__(
    hidden_size: int,
    intermediate_size: int,
    hidden_act: str,
    quant_config: Optional[QuantizationConfig] = None,
    bias: bool = False,
    prefix: str = "",
    reduce_results: bool = True,
) -> None
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def __init__(
    self,
    hidden_size: int,
    intermediate_size: int,
    hidden_act: str,
    quant_config: Optional[QuantizationConfig] = None,
    bias: bool = False,
    prefix: str = "",
    reduce_results: bool = True,
) -> None:
    super().__init__()
    self.gate_up_proj = MergedColumnParallelLinear(
        input_size=hidden_size,
        output_sizes=[intermediate_size] * 2,
        bias=bias,
        quant_config=quant_config,
        prefix=f"{prefix}.gate_up_proj",
    )
    self.down_proj = RowParallelLinear(
        input_size=intermediate_size,
        output_size=hidden_size,
        bias=bias,
        quant_config=quant_config,
        prefix=f"{prefix}.down_proj",
        reduce_results=reduce_results,
    )
    if hidden_act != "silu":
        raise ValueError(f"Unsupported activation: {hidden_act}. "
                         "Only silu is supported for now.")
    self.act_fn = SiluAndMul()

forward

forward(x)
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def forward(self, x):
    gate_up, _ = self.gate_up_proj(x)
    x = self.act_fn(gate_up)
    x, _ = self.down_proj(x)
    return x
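
gate_up_proj stacks the gate and up projections along the output dimension, so act_fn halves the width again before down_proj. A plain-PyTorch equivalent of the SiluAndMul activation used here, for illustration:

import torch
import torch.nn.functional as F

def silu_and_mul_reference(gate_up: torch.Tensor) -> torch.Tensor:
    # First half gates the second half (SwiGLU).
    gate, up = gate_up.chunk(2, dim=-1)
    return F.silu(gate) * up

x = torch.randn(2, 2 * 3072)                 # illustrative intermediate size
print(silu_and_mul_reference(x).shape)       # torch.Size([2, 3072])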

HunYuanMoEV1ForCausalLM

Bases: Module

Source code in vllm/model_executor/models/hunyuan_v1_moe.py
class HunYuanMoEV1ForCausalLM(nn.Module):
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        lora_config = vllm_config.lora_config
        self.config = config
        self.quant_config = quant_config
        self.lora_config = lora_config

        self.model = HunYuanModel(vllm_config=vllm_config, prefix="model")
        if get_pp_group().is_last_rank:
            self.unpadded_vocab_size = config.vocab_size
            if lora_config:
                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
            self.lm_head = ParallelLMHead(
                self.unpadded_vocab_size,
                config.hidden_size,
                org_num_embeddings=config.vocab_size,
                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
                quant_config=quant_config,
            )
            if config.tie_word_embeddings:
                self.lm_head.weight = self.model.embed_tokens.weight

            logit_scale = getattr(config, "logit_scale", 1.0)
            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                    config.vocab_size,
                                                    logit_scale)
            self.sampler = get_sampler()
        else:
            self.lm_head = PPMissingLayer()

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        model_output = self.model(input_ids, positions, intermediate_tensors,
                                  inputs_embeds)
        return model_output

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

    def make_empty_intermediate_tensors(
            self, batch_size: int, dtype: torch.dtype,
            device: torch.device) -> IntermediateTensors:
        return IntermediateTensors({
            "hidden_states":
            torch.zeros((batch_size, self.config.hidden_size),
                        dtype=dtype,
                        device=device),
            "residual":
            torch.zeros((batch_size, self.config.hidden_size),
                        dtype=dtype,
                        device=device),
        })

    def _split_qkv_weight(self, qkv: torch.Tensor):
        num_attention_heads = self.config.num_attention_heads
        num_kv_heads = getattr(self.config, "num_key_value_heads",
                               self.config.num_attention_heads)
        num_key_value_groups = num_attention_heads // num_kv_heads
        hidden_size = self.config.hidden_size

        if hasattr(self.config, "head_dim"):
            attention_head_dim = self.config.head_dim
        elif hasattr(self.config, "attention_head_dim"):
            attention_head_dim = self.config.attention_head_dim
        else:
            attention_head_dim = self.config.hidden_size // num_attention_heads

        qkv = qkv.reshape(num_kv_heads, num_key_value_groups + 2,
                          attention_head_dim, hidden_size)
        q, k, v = torch.split(qkv, (num_key_value_groups, 1, 1), dim=1)
        q = q.reshape(-1, hidden_size)
        k = k.reshape(-1, hidden_size)
        v = v.reshape(-1, hidden_size)
        return torch.concat((q, k, v))

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        cla_factor = _get_cla_factor(self.config)
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            (".qkv_proj", ".q_proj", "q"),
            (".qkv_proj", ".k_proj", "k"),
            (".qkv_proj", ".v_proj", "v"),
            (".gate_up_proj", ".gate_proj", 0),
            (".gate_up_proj", ".up_proj", 1),
        ]

        num_attention_heads = self.config.num_attention_heads
        num_kv_heads = getattr(self.config, "num_key_value_heads",
                               self.config.num_attention_heads)
        split_params_mapping = [
            (".gate_up_proj", ".gate_and_up_proj", 2, [(1, 1), (0, 1)], None),
            (
                ".qkv_proj",
                ".qkv_proj",
                num_attention_heads + num_kv_heads * 2,
                [("q", num_attention_heads), ("k", num_kv_heads),
                 ("v", num_kv_heads)],
                self._split_qkv_weight,
            ),
        ]

        # Params for weights, fp8 weight scales, fp8 activation scales
        # (param_name, weight_name, expert_id, shard_id)
        expert_params_mapping = FusedMoE.make_expert_params_mapping(
            ckpt_gate_proj_name="gate_proj",
            ckpt_down_proj_name="down_proj",
            ckpt_up_proj_name="up_proj",
            num_experts=self.config.num_experts,
        )

        params_dict = dict(self.named_parameters())
        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue
            if "gate_proj_bias" in name:
                name = name.replace("gate_proj_bias", "gate_proj.bias")
            if "up_proj_bias" in name:
                name = name.replace("up_proj_bias", "up_proj.bias")
            if ("rotary_emb.cos_cached" in name
                    or "rotary_emb.sin_cached" in name):
                # Models trained using ColossalAI may include these tensors in
                # the checkpoint. Skip them.
                continue
            # With tie_word_embeddings, we can skip lm_head.weight
            # The weight might appear unnecessarily in the files if the model is
            # processed with quantization, LoRA, fine-tuning, etc.
            if self.config.tie_word_embeddings and "lm_head.weight" in name:
                continue
            if self.quant_config is not None and (
                    scale_name := self.quant_config.get_cache_scale(name)):
                # Loading kv cache scales for compressed-tensors quantization
                param = params_dict[scale_name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                loaded_weight = loaded_weight[0]
                weight_loader(param, loaded_weight)
                continue

            is_found = False
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                if "mlp.experts" in name:
                    continue
                # cross-attention layers only have q_proj, so skip QKV packing
                if weight_name == ".q_proj":
                    match = re.search(r"layers\.\d+", name)
                    if match:
                        layer_id = int(match.group(0).split(".")[-1])
                        if cla_factor > 1 and layer_id % cla_factor != 0:
                            continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue

                if is_pp_missing_parameter(name, self):
                    continue

                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)

                is_found = True
                break
            if is_found:
                continue

            for (
                    param_name,
                    weight_name,
                    den,
                    split_param,
                    func,
            ) in split_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue

                if is_pp_missing_parameter(name, self):
                    continue

                assert loaded_weight.shape[0] % den == 0
                units = loaded_weight.shape[0] // den

                param = params_dict[name]
                weight_loader = param.weight_loader
                offset = 0
                for shard_id, num in split_param:
                    new_offset = offset + num * units
                    if func:
                        weight_loader(param,
                                      func(loaded_weight)[offset:new_offset],
                                      shard_id)
                    else:
                        weight_loader(param, loaded_weight[offset:new_offset],
                                      shard_id)
                    offset = new_offset

                break
            else:
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                for mapping in expert_params_mapping:
                    param_name, weight_name, expert_id, shard_id = mapping
                    if weight_name not in name:
                        continue
                    name = name.replace(weight_name, param_name)
                    # Skip layers on other devices.
                    if is_pp_missing_parameter(name, self):
                        continue
                    param = params_dict[name]
                    weight_loader = param.weight_loader
                    weight_loader(
                        param,
                        loaded_weight,
                        name,
                        shard_id=shard_id,
                        expert_id=expert_id,
                    )
                    break
                else:
                    # Remapping the name of FP8 kv-scale.
                    name = maybe_remap_kv_scale_name(name, params_dict)
                    if name is None:
                        continue

                    if is_pp_missing_parameter(name, self):
                        continue

                    if "mlp.gate.wg." in name:
                        name = name.replace("wg.", "")

                    param = params_dict[name]
                    weight_loader = getattr(param, "weight_loader",
                                            default_weight_loader)
                    weight_loader(param, loaded_weight)

config instance-attribute

config = config

lm_head instance-attribute

lm_head = ParallelLMHead(
    unpadded_vocab_size,
    hidden_size,
    org_num_embeddings=vocab_size,
    padding_size=DEFAULT_VOCAB_PADDING_SIZE,
    quant_config=quant_config,
)

logits_processor instance-attribute

logits_processor = LogitsProcessor(
    unpadded_vocab_size, vocab_size, logit_scale
)

lora_config instance-attribute

lora_config = lora_config

model instance-attribute

model = HunYuanModel(
    vllm_config=vllm_config, prefix="model"
)

packed_modules_mapping class-attribute instance-attribute

packed_modules_mapping = {
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    "gate_up_proj": ["gate_proj", "up_proj"],
}

quant_config instance-attribute

quant_config = quant_config

sampler instance-attribute

sampler = get_sampler()

unpadded_vocab_size instance-attribute

unpadded_vocab_size = vocab_size

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()

    config = vllm_config.model_config.hf_config
    quant_config = vllm_config.quant_config
    lora_config = vllm_config.lora_config
    self.config = config
    self.quant_config = quant_config
    self.lora_config = lora_config

    self.model = HunYuanModel(vllm_config=vllm_config, prefix="model")
    if get_pp_group().is_last_rank:
        self.unpadded_vocab_size = config.vocab_size
        if lora_config:
            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
        self.lm_head = ParallelLMHead(
            self.unpadded_vocab_size,
            config.hidden_size,
            org_num_embeddings=config.vocab_size,
            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
            quant_config=quant_config,
        )
        if config.tie_word_embeddings:
            self.lm_head.weight = self.model.embed_tokens.weight

        logit_scale = getattr(config, "logit_scale", 1.0)
        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                config.vocab_size,
                                                logit_scale)
        self.sampler = get_sampler()
    else:
        self.lm_head = PPMissingLayer()

_split_qkv_weight

_split_qkv_weight(qkv: Tensor)
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def _split_qkv_weight(self, qkv: torch.Tensor):
    num_attention_heads = self.config.num_attention_heads
    num_kv_heads = getattr(self.config, "num_key_value_heads",
                           self.config.num_attention_heads)
    num_key_value_groups = num_attention_heads // num_kv_heads
    hidden_size = self.config.hidden_size

    if hasattr(self.config, "head_dim"):
        attention_head_dim = self.config.head_dim
    elif hasattr(self.config, "attention_head_dim"):
        attention_head_dim = self.config.attention_head_dim
    else:
        attention_head_dim = self.config.hidden_size // num_attention_heads

    qkv = qkv.reshape(num_kv_heads, num_key_value_groups + 2,
                      attention_head_dim, hidden_size)
    q, k, v = torch.split(qkv, (num_key_value_groups, 1, 1), dim=1)
    q = q.reshape(-1, hidden_size)
    k = k.reshape(-1, hidden_size)
    v = v.reshape(-1, hidden_size)
    return torch.concat((q, k, v))
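
The checkpoint stores the fused QKV weight grouped per KV head as [q_0 .. q_{g-1}, k, v]; _split_qkv_weight reorders the rows into the [all q | all k | all v] layout expected by QKVParallelLinear. A shape-only sketch with small, assumed sizes:

import torch

num_heads, num_kv_heads, head_dim, hidden = 4, 2, 3, 12   # assumed sizes
groups = num_heads // num_kv_heads             # query heads per KV head

qkv = torch.randn(num_kv_heads * (groups + 2) * head_dim, hidden)
qkv = qkv.reshape(num_kv_heads, groups + 2, head_dim, hidden)
q, k, v = torch.split(qkv, (groups, 1, 1), dim=1)
out = torch.concat((q.reshape(-1, hidden),
                    k.reshape(-1, hidden),
                    v.reshape(-1, hidden)))
assert out.shape == ((num_heads + 2 * num_kv_heads) * head_dim, hidden)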

compute_logits

compute_logits(
    hidden_states: Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[Tensor]
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def compute_logits(
    self,
    hidden_states: torch.Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
    logits = self.logits_processor(self.lm_head, hidden_states,
                                   sampling_metadata)
    return logits

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    inputs_embeds: Optional[Tensor] = None,
) -> Union[Tensor, IntermediateTensors]
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
    model_output = self.model(input_ids, positions, intermediate_tensors,
                              inputs_embeds)
    return model_output

load_weights

load_weights(weights: Iterable[tuple[str, Tensor]])
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
    cla_factor = _get_cla_factor(self.config)
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        (".qkv_proj", ".q_proj", "q"),
        (".qkv_proj", ".k_proj", "k"),
        (".qkv_proj", ".v_proj", "v"),
        (".gate_up_proj", ".gate_proj", 0),
        (".gate_up_proj", ".up_proj", 1),
    ]

    num_attention_heads = self.config.num_attention_heads
    num_kv_heads = getattr(self.config, "num_key_value_heads",
                           self.config.num_attention_heads)
    split_params_mapping = [
        (".gate_up_proj", ".gate_and_up_proj", 2, [(1, 1), (0, 1)], None),
        (
            ".qkv_proj",
            ".qkv_proj",
            num_attention_heads + num_kv_heads * 2,
            [("q", num_attention_heads), ("k", num_kv_heads),
             ("v", num_kv_heads)],
            self._split_qkv_weight,
        ),
    ]

    # Params for weights, fp8 weight scales, fp8 activation scales
    # (param_name, weight_name, expert_id, shard_id)
    expert_params_mapping = FusedMoE.make_expert_params_mapping(
        ckpt_gate_proj_name="gate_proj",
        ckpt_down_proj_name="down_proj",
        ckpt_up_proj_name="up_proj",
        num_experts=self.config.num_experts,
    )

    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        if "rotary_emb.inv_freq" in name:
            continue
        if "gate_proj_bias" in name:
            name = name.replace("gate_proj_bias", "gate_proj.bias")
        if "up_proj_bias" in name:
            name = name.replace("up_proj_bias", "up_proj.bias")
        if ("rotary_emb.cos_cached" in name
                or "rotary_emb.sin_cached" in name):
            # Models trained using ColossalAI may include these tensors in
            # the checkpoint. Skip them.
            continue
        # With tie_word_embeddings, we can skip lm_head.weight
        # The weight might appear unnecessarily in the files if the model is
        # processed with quantization, LoRA, fine-tuning, etc.
        if self.config.tie_word_embeddings and "lm_head.weight" in name:
            continue
        if self.quant_config is not None and (
                scale_name := self.quant_config.get_cache_scale(name)):
            # Loading kv cache scales for compressed-tensors quantization
            param = params_dict[scale_name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            loaded_weight = loaded_weight[0]
            weight_loader(param, loaded_weight)
            continue

        is_found = False
        for param_name, weight_name, shard_id in stacked_params_mapping:
            if weight_name not in name:
                continue
            if "mlp.experts" in name:
                continue
            # cross-attention layers only have q_proj, so skip QKV packing
            if weight_name == ".q_proj":
                match = re.search(r"layers\.\d+", name)
                if match:
                    layer_id = int(match.group(0).split(".")[-1])
                    if cla_factor > 1 and layer_id % cla_factor != 0:
                        continue
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue

            if is_pp_missing_parameter(name, self):
                continue

            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)

            is_found = True
            break
        if is_found:
            continue

        for (
                param_name,
                weight_name,
                den,
                split_param,
                func,
        ) in split_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue

            if is_pp_missing_parameter(name, self):
                continue

            assert loaded_weight.shape[0] % den == 0
            units = loaded_weight.shape[0] // den

            param = params_dict[name]
            weight_loader = param.weight_loader
            offset = 0
            for shard_id, num in split_param:
                new_offset = offset + num * units
                if func:
                    weight_loader(param,
                                  func(loaded_weight)[offset:new_offset],
                                  shard_id)
                else:
                    weight_loader(param, loaded_weight[offset:new_offset],
                                  shard_id)
                offset = new_offset

            break
        else:
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            for mapping in expert_params_mapping:
                param_name, weight_name, expert_id, shard_id = mapping
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip layers on other devices.
                if is_pp_missing_parameter(name, self):
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(
                    param,
                    loaded_weight,
                    name,
                    shard_id=shard_id,
                    expert_id=expert_id,
                )
                break
            else:
                # Remapping the name of FP8 kv-scale.
                name = maybe_remap_kv_scale_name(name, params_dict)
                if name is None:
                    continue

                if is_pp_missing_parameter(name, self):
                    continue

                if "mlp.gate.wg." in name:
                    name = name.replace("wg.", "")

                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
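
The stacked-parameter branch above folds separate q_proj/k_proj/v_proj checkpoint tensors into the fused qkv_proj parameter, and it deliberately skips packing for cross-layer-attention layers whose index is not a multiple of cla_factor (those layers carry only q_proj and are handled by the later, unfused branches). A minimal, self-contained sketch of that name-remapping decision; the mapping table and helper below are illustrative stand-ins, not the module's actual stacked_params_mapping:

import re

# Illustrative subset of a stacked-params mapping:
# (fused param name, per-shard checkpoint name, shard id)
STACKED = [
    (".qkv_proj", ".q_proj", "q"),
    (".qkv_proj", ".k_proj", "k"),
    (".qkv_proj", ".v_proj", "v"),
]

def remap(name: str, cla_factor: int):
    """Return (fused_name, shard_id), or None when the weight is not packed."""
    for fused, shard_name, shard_id in STACKED:
        if shard_name not in name:
            continue
        if shard_name == ".q_proj":
            match = re.search(r"layers\.\d+", name)
            if match:
                layer_id = int(match.group(0).split(".")[-1])
                # CLA layers that reuse KV states carry only q_proj; they are
                # loaded by the later, unfused branches instead.
                if cla_factor > 1 and layer_id % cla_factor != 0:
                    return None
        return name.replace(shard_name, fused), shard_id
    return None

print(remap("model.layers.0.self_attn.q_proj.weight", cla_factor=2))
# ('model.layers.0.self_attn.qkv_proj.weight', 'q')
print(remap("model.layers.1.self_attn.q_proj.weight", cla_factor=2))
# None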

make_empty_intermediate_tensors

make_empty_intermediate_tensors(
    batch_size: int, dtype: dtype, device: device
) -> IntermediateTensors
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def make_empty_intermediate_tensors(
        self, batch_size: int, dtype: torch.dtype,
        device: torch.device) -> IntermediateTensors:
    return IntermediateTensors({
        "hidden_states":
        torch.zeros((batch_size, self.config.hidden_size),
                    dtype=dtype,
                    device=device),
        "residual":
        torch.zeros((batch_size, self.config.hidden_size),
                    dtype=dtype,
                    device=device),
    })
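
With pipeline parallelism, non-first ranks need correctly shaped placeholders to receive the previous stage's activations. A minimal usage sketch, assuming `model` is an already constructed HunYuanModel on a CUDA-capable rank (the variable name is illustrative):

import torch

placeholders = model.make_empty_intermediate_tensors(
    batch_size=4,
    dtype=torch.bfloat16,
    device=torch.device("cuda"),
)

# Both entries are zero tensors of shape (batch_size, hidden_size).
assert placeholders["hidden_states"].shape == (4, model.config.hidden_size)
assert placeholders["residual"].shape == (4, model.config.hidden_size)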

sample

sample(
    logits: Tensor, sampling_metadata: SamplingMetadata
) -> Optional[SamplerOutput]
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def sample(
    self,
    logits: torch.Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[SamplerOutput]:
    next_tokens = self.sampler(logits, sampling_metadata)
    return next_tokens

HunYuanModel

Bases: Module

Source code in vllm/model_executor/models/hunyuan_v1_moe.py
@support_torch_compile
class HunYuanModel(nn.Module):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config
        lora_config = vllm_config.lora_config

        self.config = config
        self.quant_config = quant_config
        self.padding_idx = config.pad_token_id
        lora_vocab = ((lora_config.lora_extra_vocab_size *
                       (lora_config.max_loras or 1)) if lora_config else 0)
        self.vocab_size = config.vocab_size + lora_vocab
        self.org_vocab_size = config.vocab_size
        if get_pp_group().is_first_rank or (config.tie_word_embeddings
                                            and get_pp_group().is_last_rank):
            self.embed_tokens = VocabParallelEmbedding(
                self.vocab_size,
                config.hidden_size,
                org_num_embeddings=config.vocab_size,
                quant_config=quant_config,
            )
        else:
            self.embed_tokens = PPMissingLayer()
        self.start_layer, self.end_layer, self.layers = make_layers(
            config.num_hidden_layers,
            lambda prefix: HunYuanDecoderLayer(
                config=config,
                layer_id=int(prefix.split(".")[-1]),
                cache_config=cache_config,
                quant_config=quant_config,
                prefix=prefix,
            ),
            prefix=f"{prefix}.layers",
        )
        if get_pp_group().is_last_rank:
            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        else:
            self.norm = PPMissingLayer()

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors],
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                hidden_states = self.get_input_embeddings(input_ids)
            residual = None
        else:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]

        cla_factor = _get_cla_factor(self.config)
        prev_kv_states = None
        for i in range(self.start_layer, self.end_layer):
            layer = self.layers[i]
            hidden_states, residual, kv_states = layer(
                positions,
                hidden_states,
                residual,
                prev_kv_states,
            )

            if (getattr(self.config, "use_cla", False)
                    and (i - self.start_layer) % cla_factor == 0):
                prev_kv_states = kv_states
            else:
                prev_kv_states = None

        if not get_pp_group().is_last_rank:
            return IntermediateTensors({
                "hidden_states": hidden_states,
                "residual": residual
            })

        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states
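
The `use_cla` branch in `forward` implements cross-layer attention sharing: a layer whose offset from `start_layer` is a multiple of `cla_factor` publishes its KV states, and the layer immediately after it receives them as `prev_kv_states`; all other layers pass nothing on. The following standalone sketch (plain Python, no vLLM imports) traces that schedule for a hypothetical 6-layer stage with `cla_factor = 2`:

def cla_schedule(num_layers: int, start_layer: int, cla_factor: int):
    """Trace which prev_kv_states each layer sees, mirroring the forward loop."""
    prev = None
    trace = []
    for i in range(start_layer, start_layer + num_layers):
        trace.append((i, prev))           # layer i consumes `prev`
        if (i - start_layer) % cla_factor == 0:
            prev = f"kv@{i}"              # producer layer: publish its KV states
        else:
            prev = None                   # consumer layer: nothing to pass on
    return trace

for layer, consumed in cla_schedule(num_layers=6, start_layer=0, cla_factor=2):
    print(layer, "reuses", consumed)
# 0 reuses None   (produces kv@0)
# 1 reuses kv@0
# 2 reuses None   (produces kv@2)
# 3 reuses kv@2
# 4 reuses None   (produces kv@4)
# 5 reuses kv@4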

config instance-attribute

config = config

embed_tokens instance-attribute

embed_tokens = VocabParallelEmbedding(
    vocab_size,
    hidden_size,
    org_num_embeddings=vocab_size,
    quant_config=quant_config,
)

norm instance-attribute

norm = RMSNorm(hidden_size, eps=rms_norm_eps)

org_vocab_size instance-attribute

org_vocab_size = vocab_size

padding_idx instance-attribute

padding_idx = pad_token_id

quant_config instance-attribute

quant_config = quant_config

vocab_size instance-attribute

vocab_size = vocab_size + lora_vocab

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()

    config = vllm_config.model_config.hf_config
    cache_config = vllm_config.cache_config
    quant_config = vllm_config.quant_config
    lora_config = vllm_config.lora_config

    self.config = config
    self.quant_config = quant_config
    self.padding_idx = config.pad_token_id
    lora_vocab = ((lora_config.lora_extra_vocab_size *
                   (lora_config.max_loras or 1)) if lora_config else 0)
    self.vocab_size = config.vocab_size + lora_vocab
    self.org_vocab_size = config.vocab_size
    if get_pp_group().is_first_rank or (config.tie_word_embeddings
                                        and get_pp_group().is_last_rank):
        self.embed_tokens = VocabParallelEmbedding(
            self.vocab_size,
            config.hidden_size,
            org_num_embeddings=config.vocab_size,
            quant_config=quant_config,
        )
    else:
        self.embed_tokens = PPMissingLayer()
    self.start_layer, self.end_layer, self.layers = make_layers(
        config.num_hidden_layers,
        lambda prefix: HunYuanDecoderLayer(
            config=config,
            layer_id=int(prefix.split(".")[-1]),
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=prefix,
        ),
        prefix=f"{prefix}.layers",
    )
    if get_pp_group().is_last_rank:
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
    else:
        self.norm = PPMissingLayer()
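
Under pipeline parallelism, `__init__` only materializes `embed_tokens` on the first rank (or on the last rank when embeddings are tied) and `norm` on the last rank; other ranks get `PPMissingLayer` placeholders. A small sketch of that ownership rule for a hypothetical 4-stage pipeline (the helper below is illustrative, not part of vLLM):

def stage_modules(rank: int, world_size: int, tie_word_embeddings: bool = False):
    """Which of embed_tokens / norm a given pipeline stage actually owns."""
    first, last = rank == 0, rank == world_size - 1
    return {
        "embed_tokens": first or (tie_word_embeddings and last),
        "norm": last,
    }

for r in range(4):
    print(r, stage_modules(r, 4))
# 0 {'embed_tokens': True, 'norm': False}
# 1 {'embed_tokens': False, 'norm': False}
# 2 {'embed_tokens': False, 'norm': False}
# 3 {'embed_tokens': False, 'norm': True}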

forward

forward(
    input_ids: Optional[Tensor],
    positions: Tensor,
    intermediate_tensors: Optional[IntermediateTensors],
    inputs_embeds: Optional[Tensor] = None,
) -> Union[Tensor, IntermediateTensors]
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def forward(
    self,
    input_ids: Optional[torch.Tensor],
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors],
    inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
    if get_pp_group().is_first_rank:
        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.get_input_embeddings(input_ids)
        residual = None
    else:
        assert intermediate_tensors is not None
        hidden_states = intermediate_tensors["hidden_states"]
        residual = intermediate_tensors["residual"]

    cla_factor = _get_cla_factor(self.config)
    prev_kv_states = None
    for i in range(self.start_layer, self.end_layer):
        layer = self.layers[i]
        hidden_states, residual, kv_states = layer(
            positions,
            hidden_states,
            residual,
            prev_kv_states,
        )

        if (getattr(self.config, "use_cla", False)
                and (i - self.start_layer) % cla_factor == 0):
            prev_kv_states = kv_states
        else:
            prev_kv_states = None

    if not get_pp_group().is_last_rank:
        return IntermediateTensors({
            "hidden_states": hidden_states,
            "residual": residual
        })

    hidden_states, _ = self.norm(hidden_states, residual)
    return hidden_states

get_input_embeddings

get_input_embeddings(input_ids: Tensor) -> Tensor
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
    return self.embed_tokens(input_ids)

HunYuanSparseMoeBlock

Bases: Module

Source code in vllm/model_executor/models/hunyuan_v1_moe.py
class HunYuanSparseMoeBlock(nn.Module):

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        layer_id: int = -1,
        prefix: str = "",
    ):
        super().__init__()
        self.tp_size = get_tensor_model_parallel_world_size()

        if self.tp_size > config.num_experts:
            raise ValueError(
                f"Tensor parallel size {self.tp_size} is greater than "
                f"the number of experts {config.num_experts}.")

        # Get layer_id topk if config.moe_topk is a list
        if isinstance(config.moe_topk, list):
            assert layer_id >= 0
            assert len(config.moe_topk) > layer_id
            top_k = config.moe_topk[layer_id]
        else:
            top_k = config.moe_topk

        # If it is moe, moe_intermediate_size is preferred
        intermediate_size = config.intermediate_size
        if config.moe_intermediate_size is not None:
            intermediate_size = (config.moe_intermediate_size if isinstance(
                config.moe_intermediate_size, int) else
                                 config.moe_intermediate_size[layer_id])

        self.experts = FusedMoE(
            num_experts=config.num_experts,
            top_k=top_k,
            hidden_size=config.hidden_size,
            intermediate_size=intermediate_size,
            reduce_results=False,
            renormalize=top_k > 1,
            quant_config=quant_config,
            prefix=f"{prefix}.experts",
        )

        self.gate = ReplicatedLinear(config.hidden_size,
                                     config.num_experts,
                                     bias=False,
                                     quant_config=None,
                                     prefix=f"{prefix}.gate")
        if config.use_mixed_mlp_moe > 0:
            # Get layer_id num_shared_expert if config.num_shared_expert is
            # a list.
            if isinstance(config.num_shared_expert, list):
                assert layer_id >= 0
                assert len(config.num_shared_expert) > layer_id
                num_shared_expert = config.num_shared_expert[layer_id]
            else:
                num_shared_expert = config.num_shared_expert

            self.shared_mlp = HunYuanMLP(
                hidden_size=config.hidden_size,
                intermediate_size=config.intermediate_size * num_shared_expert,
                hidden_act=config.hidden_act,
                quant_config=quant_config,
                reduce_results=False,
            )
        else:
            self.shared_mlp = None

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # NOTE: hidden_states can have either 1D or 2D shape.
        orig_shape = hidden_states.shape
        hidden_dim = hidden_states.shape[-1]
        hidden_states = hidden_states.view(-1, hidden_dim)
        shared_output = None
        if self.shared_mlp is not None:
            shared_output = self.shared_mlp(hidden_states)

        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)
        final_hidden_states = self.experts(hidden_states=hidden_states,
                                           router_logits=router_logits)
        if shared_output is not None:
            final_hidden_states = final_hidden_states + shared_output
        if self.tp_size > 1:
            final_hidden_states = tensor_model_parallel_all_reduce(
                final_hidden_states)

        return final_hidden_states.view(orig_shape)
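
Conceptually, the gate produces one logit per expert for every token, `FusedMoE` performs top-k selection and renormalizes the selected probabilities when `top_k > 1`, then the optional shared-MLP output is added and a single tensor-parallel all-reduce finishes the block. A minimal PyTorch sketch of the top-k routing arithmetic only; dense per-expert functions stand in for the fused kernel, and the shared-MLP addition and all-reduce are left out:

import torch

def reference_moe(hidden, gate_w, expert_fns, top_k):
    """Dense reference for top-k routing with renormalization (not the fused kernel)."""
    logits = hidden @ gate_w                       # (tokens, num_experts)
    probs = torch.softmax(logits, dim=-1)
    topk_p, topk_idx = probs.topk(top_k, dim=-1)
    if top_k > 1:
        topk_p = topk_p / topk_p.sum(dim=-1, keepdim=True)  # renormalize
    out = torch.zeros_like(hidden)
    for tok in range(hidden.shape[0]):
        for p, e in zip(topk_p[tok], topk_idx[tok]):
            out[tok] += p * expert_fns[e](hidden[tok])
    return out

# Toy instance: 4 experts that just scale the input, top_k = 2.
torch.manual_seed(0)
hidden = torch.randn(3, 8)
gate_w = torch.randn(8, 4)
experts = [lambda x, s=s: s * x for s in (0.5, 1.0, 1.5, 2.0)]
print(reference_moe(hidden, gate_w, experts, top_k=2).shape)  # torch.Size([3, 8])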

experts instance-attribute

experts = FusedMoE(
    num_experts=num_experts,
    top_k=top_k,
    hidden_size=hidden_size,
    intermediate_size=intermediate_size,
    reduce_results=False,
    renormalize=top_k > 1,
    quant_config=quant_config,
    prefix=f"{prefix}.experts",
)

gate instance-attribute

gate = ReplicatedLinear(
    hidden_size,
    num_experts,
    bias=False,
    quant_config=None,
    prefix=f"{prefix}.gate",
)

shared_mlp instance-attribute

shared_mlp = HunYuanMLP(
    hidden_size=hidden_size,
    intermediate_size=intermediate_size * num_shared_expert,
    hidden_act=hidden_act,
    quant_config=quant_config,
    reduce_results=False,
)

tp_size instance-attribute

tp_size = get_tensor_model_parallel_world_size()

__init__

__init__(
    config: PretrainedConfig,
    quant_config: Optional[QuantizationConfig] = None,
    layer_id: int = -1,
    prefix: str = "",
)
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def __init__(
    self,
    config: PretrainedConfig,
    quant_config: Optional[QuantizationConfig] = None,
    layer_id: int = -1,
    prefix: str = "",
):
    super().__init__()
    self.tp_size = get_tensor_model_parallel_world_size()

    if self.tp_size > config.num_experts:
        raise ValueError(
            f"Tensor parallel size {self.tp_size} is greater than "
            f"the number of experts {config.num_experts}.")

    # Get layer_id topk if config.moe_topk is a list
    if isinstance(config.moe_topk, list):
        assert layer_id >= 0
        assert len(config.moe_topk) > layer_id
        top_k = config.moe_topk[layer_id]
    else:
        top_k = config.moe_topk

    # If it is moe, moe_intermediate_size is preferred
    intermediate_size = config.intermediate_size
    if config.moe_intermediate_size is not None:
        intermediate_size = (config.moe_intermediate_size if isinstance(
            config.moe_intermediate_size, int) else
                             config.moe_intermediate_size[layer_id])

    self.experts = FusedMoE(
        num_experts=config.num_experts,
        top_k=top_k,
        hidden_size=config.hidden_size,
        intermediate_size=intermediate_size,
        reduce_results=False,
        renormalize=top_k > 1,
        quant_config=quant_config,
        prefix=f"{prefix}.experts",
    )

    self.gate = ReplicatedLinear(config.hidden_size,
                                 config.num_experts,
                                 bias=False,
                                 quant_config=None,
                                 prefix=f"{prefix}.gate")
    if config.use_mixed_mlp_moe > 0:
        # Get layer_id num_shared_expert if config.num_shared_expert is
        # a list.
        if isinstance(config.num_shared_expert, list):
            assert layer_id >= 0
            assert len(config.num_shared_expert) > layer_id
            num_shared_expert = config.num_shared_expert[layer_id]
        else:
            num_shared_expert = config.num_shared_expert

        self.shared_mlp = HunYuanMLP(
            hidden_size=config.hidden_size,
            intermediate_size=config.intermediate_size * num_shared_expert,
            hidden_act=config.hidden_act,
            quant_config=quant_config,
            reduce_results=False,
        )
    else:
        self.shared_mlp = None
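
`moe_topk`, `moe_intermediate_size`, and `num_shared_expert` may each be either a single value applied to every layer or a per-layer list indexed by `layer_id`. A small illustration of how a hypothetical config fragment with list-valued fields resolves for layer 3 (the values below are made up for the example):

from types import SimpleNamespace

# Hypothetical config fragment; real checkpoints define these in config.json.
cfg = SimpleNamespace(
    moe_topk=[1, 1, 2, 2],
    intermediate_size=3072,
    moe_intermediate_size=[1024, 1024, 2048, 2048],
)
layer_id = 3

top_k = cfg.moe_topk[layer_id] if isinstance(cfg.moe_topk, list) else cfg.moe_topk
intermediate = (cfg.moe_intermediate_size
                if isinstance(cfg.moe_intermediate_size, int)
                else cfg.moe_intermediate_size[layer_id])
print(top_k, intermediate)  # 2 2048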

forward

forward(hidden_states: Tensor) -> Tensor
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    # NOTE: hidden_states can have either 1D or 2D shape.
    orig_shape = hidden_states.shape
    hidden_dim = hidden_states.shape[-1]
    hidden_states = hidden_states.view(-1, hidden_dim)
    shared_output = None
    if self.shared_mlp is not None:
        shared_output = self.shared_mlp(hidden_states)

    # router_logits: (num_tokens, n_experts)
    router_logits, _ = self.gate(hidden_states)
    final_hidden_states = self.experts(hidden_states=hidden_states,
                                       router_logits=router_logits)
    if shared_output is not None:
        final_hidden_states = final_hidden_states + shared_output
    if self.tp_size > 1:
        final_hidden_states = tensor_model_parallel_all_reduce(
            final_hidden_states)

    return final_hidden_states.view(orig_shape)

_get_cla_factor

_get_cla_factor(config: PretrainedConfig) -> int
Source code in vllm/model_executor/models/hunyuan_v1_moe.py
def _get_cla_factor(config: PretrainedConfig) -> int:
    if not getattr(config, "use_cla", False):
        return 1
    return getattr(config, "cla_share_factor", 1)
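
The helper degrades gracefully when cross-layer attention is disabled or unconfigured: any config without `use_cla` (or with it set to False) yields a factor of 1, so the CLA-specific code paths become no-ops. A quick sketch with stand-in configs (SimpleNamespace is used purely for illustration in place of a PretrainedConfig):

from types import SimpleNamespace

print(_get_cla_factor(SimpleNamespace(use_cla=True, cla_share_factor=2)))   # 2
print(_get_cla_factor(SimpleNamespace(use_cla=False, cla_share_factor=2)))  # 1
print(_get_cla_factor(SimpleNamespace()))                                   # 1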