@support_torch_compile
class DeepSeekV4MTP(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
self.config = vllm_config.model_config.hf_config
self.quant_config = vllm_config.quant_config
self.model = DeepSeekV4MultiTokenPredictor(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
)
def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.model.embed_input_ids(input_ids)
def forward(
self,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
hidden_states: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
spec_step_idx: int = 0,
) -> torch.Tensor:
hidden_states = self.model(
input_ids, positions, hidden_states, inputs_embeds, spec_step_idx
)
return hidden_states
def compute_logits(
self,
hidden_states: torch.Tensor,
spec_step_idx: int = 0,
) -> torch.Tensor | None:
return self.model.compute_logits(hidden_states, spec_step_idx)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
# Weight name remapping for checkpoint compatibility.
# Maps checkpoint weight paths to model parameter paths.
WEIGHT_NAME_REMAPPING: dict[str, str] = {
".emb.tok_emb.weight": ".embed_tokens.weight",
".head.weight": ".shared_head.head.weight",
".norm.weight": ".shared_head.norm.weight",
}
def _remap_weight_name(name: str) -> str:
"""Remap checkpoint weight names to model parameter names."""
for old_pattern, new_pattern in WEIGHT_NAME_REMAPPING.items():
if old_pattern in name:
name = name.replace(old_pattern, new_pattern)
return name
def _find_mtp_layer_idx(name: str) -> int:
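            # e.g. "mtp.1.attn.wq_a.weight" -> 1; names without an integer
            # component fall back to 0.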
subnames = name.split(".")
for subname in subnames:
try:
                    # Return the first integer component encountered.
return int(subname)
except ValueError:
continue
return 0
stacked_params_mapping = [
            # (param_name, weight_name, shard_id)
("gate_up_proj", "w1", 0),
("gate_up_proj", "w3", 1),
("attn.fused_wqa_wkv", "attn.wq_a", 0),
("attn.fused_wqa_wkv", "attn.wkv", 1),
]
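        # e.g. a checkpoint weight named "...ffn.w1.weight" loads into shard 0
        # of the fused "...ffn.gate_up_proj.weight" parameter, and "...w3"
        # into shard 1 (name prefixes illustrative).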
params_dict = dict(self.named_parameters())
loaded_params: set[str] = set()
        # Tensor-parallel slicing for attention heads.
tp_size = get_tensor_model_parallel_world_size()
tp_rank = get_tensor_model_parallel_rank()
n_head = self.config.num_attention_heads
n_local_head = n_head // tp_size
head_rank_start = n_local_head * tp_rank
head_rank_end = n_local_head * (tp_rank + 1)
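        # e.g. with n_head = 64 and tp_size = 8, rank 2 owns heads [16, 24).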
        # Pre-compute the expert mapping once, outside the per-weight loop.
first_layer = next(iter(self.model.layers.values()))
if first_layer.mtp_block.ffn.use_mega_moe:
expert_mapping = make_deepseek_v4_expert_params_mapping(
self.config.n_routed_experts
)
else:
expert_mapping = FusedMoE.make_expert_params_mapping(
self,
ckpt_gate_proj_name="w1",
ckpt_down_proj_name="w2",
ckpt_up_proj_name="w3",
num_experts=self.config.n_routed_experts,
)
# FP8 experts register ``..._weight_scale_inv`` (block_quant) while
# FP4/MXFP4 experts register ``..._weight_scale``. Choose the suffix
# for the rename below based on the model's expert dtype.
expert_scale_suffix = (
".weight_scale"
if getattr(self.config, "expert_dtype", "fp4") == "fp4"
else ".weight_scale_inv"
)
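        # e.g. expert_dtype == "fp4" (also the default when the attribute is
        # absent) selects ".weight_scale"; any other value selects
        # ".weight_scale_inv".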
for name, loaded_weight in weights:
mtp_layer_idx = _find_mtp_layer_idx(name)
# V4 checkpoints store MTP weights as `mtp.{i}.*`; remap to
# `model.layers.{num_hidden_layers + i}.*` so that
# get_spec_layer_idx_from_weight_name can identify them.
name = name.replace(
f"mtp.{mtp_layer_idx}.",
f"model.layers.{self.config.num_hidden_layers + mtp_layer_idx}.",
)
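            # e.g. with num_hidden_layers = 61 (illustrative):
            #     "mtp.0.enorm.weight" -> "model.layers.61.enorm.weight"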
spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
if spec_layer is None:
continue
name = _remap_weight_name(name)
name = self._rewrite_spec_layer_name(spec_layer, name)
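            # Shared weights (e.g. embed_tokens) are rewritten to top level
            # above and loaded only from the first MTP layer; skip later
            # duplicates.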
if spec_layer != self.model.mtp_start_layer_idx and ".layers" not in name:
continue
if name.endswith(".scale"):
suffix = (
expert_scale_suffix
if _EXPERT_SCALE_RE.search(name)
else ".weight_scale_inv"
)
name = name.removesuffix(".scale") + suffix
for param_name, weight_name, shard_id in stacked_params_mapping:
# Skip non-stacked layers and experts (experts handled below).
if ".experts." in name:
continue
if weight_name not in name:
continue
name = name.replace(weight_name, param_name)
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, loaded_weight, shard_id)
loaded_params.add(name)
break
else:
if ".experts." in name:
# Reinterpret E8M0 scales as uint8 to preserve raw
# exponent bytes; numeric copy_() would zero them.
# Mirrors the main DeepseekV4 loader.
if (
"weight_scale" in name
and loaded_weight.dtype == torch.float8_e8m0fnu
):
loaded_weight = loaded_weight.view(torch.uint8)
for mapping in expert_mapping:
param_name, weight_name, expert_id, shard_id = mapping
if weight_name not in name:
continue
name_mapped = name.replace(weight_name, param_name)
param = params_dict[name_mapped]
                        # Ask the weight loader to report success; otherwise
                        # we may silently skip experts whose replicas live on
                        # other ranks.
weight_loader = typing.cast(
Callable[..., bool], param.weight_loader
)
success = weight_loader(
param,
loaded_weight,
name_mapped,
shard_id=shard_id,
expert_id=expert_id,
return_success=True,
)
if success:
name = name_mapped
loaded_params.add(name_mapped)
break
continue
elif "attn_sink" in name:
narrow_weight = loaded_weight[head_rank_start:head_rank_end]
n = narrow_weight.shape[0]
params_dict[name][:n].copy_(narrow_weight)
loaded_params.add(name)
continue
else:
if ".shared_experts.w2" in name:
name = name.replace(
".shared_experts.w2", ".shared_experts.down_proj"
)
if name.endswith(".ffn.gate.bias"):
name = name.replace(".bias", ".e_score_correction_bias")
param = params_dict[name]
weight_loader = getattr(
param, "weight_loader", default_weight_loader
)
weight_loader(param, loaded_weight)
loaded_params.add(name)
continue
loaded_layers: set[int] = set()
for param_name in loaded_params:
spec_layer = get_spec_layer_idx_from_weight_name(self.config, param_name)
if spec_layer is not None:
loaded_layers.add(spec_layer)
for layer_idx in range(
self.model.mtp_start_layer_idx,
self.model.mtp_start_layer_idx + self.model.num_mtp_layers,
):
if layer_idx not in loaded_layers:
raise ValueError(
f"MTP speculative decoding layer {layer_idx} weights "
f"missing from checkpoint. The checkpoint may have "
f"been quantized without including the MTP layers. "
f"Use a checkpoint that includes MTP layer weights, "
f"or disable speculative decoding."
)
self.finalize_mega_moe_weights()
logger.info_once("MTP draft model loaded: %d params", len(loaded_params))
return loaded_params
def finalize_mega_moe_weights(self) -> None:
for layer in self.model.layers.values():
layer.mtp_block.ffn.finalize_mega_moe_weights()
def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str:
"""
Rewrite the weight name to match the format of the original model.
Add .mtp_block for modules in transformer layer block for spec layer
and rename shared layer weights to be top level.
"""
spec_layer_weight_names = [
"embed_tokens",
"enorm",
"hnorm",
"h_proj",
"e_proj",
"shared_head",
"hc_head_fn",
"hc_head_base",
"hc_head_scale",
]
shared_weight_names = ["embed_tokens"]
spec_layer_weight = False
shared_weight = False
for weight_name in spec_layer_weight_names:
if weight_name in name:
spec_layer_weight = True
if weight_name in shared_weight_names:
shared_weight = True
break
if not spec_layer_weight:
            # Treat the remaining weights as part of the transformer layer block.
name = name.replace(
f"model.layers.{spec_layer}.", f"model.layers.{spec_layer}.mtp_block."
)
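            # e.g. "model.layers.61.attn.wq_a.weight"
            #      -> "model.layers.61.mtp_block.attn.wq_a.weight"
            # (spec layer index illustrative)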
elif shared_weight:
            # Promote shared weights to top-level weights.
name = name.replace(f"model.layers.{spec_layer}.", "model.")
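            # e.g. "model.layers.61.embed_tokens.weight"
            #      -> "model.embed_tokens.weight" (index illustrative)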
return name