vllm.model_executor.models.moondream3

Inference-only Moondream3 model implementation.

Moondream3Attention

Bases: Module

Decoder attention with RoPE and tau scaling.

Moondream3 uses a tau attention mechanism that scales Q and V based on both token content and position.
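For reference, the tau scale applied to Q reduces to a per-token, per-head factor combining a token term and a position term. A minimal sketch (the helper name and standalone form are illustrative; shapes follow the comments in forward below):

import torch
import torch.nn.functional as F

def tau_scale(qkv: torch.Tensor, positions: torch.Tensor,
              tau_w: torch.Tensor, tau_alpha: torch.Tensor) -> torch.Tensor:
    # Token-based component: tanh(gelu(qkv) @ tau_w.T) -> [num_tokens, num_heads]
    tok = torch.tanh(F.gelu(qkv) @ tau_w.t())
    # Position-based component: 1 + (sigmoid(alpha * log(pos + 1)) - 0.5)
    pos_log = (positions.float() + 1.0).clamp(min=1e-6).log()
    tau_pos = 1.0 + (torch.sigmoid(tau_alpha[:, None] * pos_log[None, :]) - 0.5)
    # Combined scale, one value per (token, head); Q uses tau_wq, V uses tau_wv.
    return tok + tau_pos.t()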

Source code in vllm/model_executor/models/moondream3.py
class Moondream3Attention(nn.Module):
    """Decoder attention with RoPE and tau scaling.

    Moondream3 uses a tau attention mechanism that scales Q and V
    based on both token content and position.
    """

    def __init__(
        self,
        config: Moondream3TextConfig,
        layer_idx: int,
        cache_config=None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.hidden_size = config.dim
        self.num_heads = config.n_heads
        self.num_kv_heads = config.n_kv_heads
        self.head_dim = config.dim // config.n_heads

        tp_size = get_tensor_model_parallel_world_size()
        self.num_heads_per_partition = self.num_heads // tp_size
        self.num_kv_heads_per_partition = max(1, self.num_kv_heads // tp_size)

        self.qkv_proj = QKVParallelLinear(
            hidden_size=self.hidden_size,
            head_size=self.head_dim,
            total_num_heads=self.num_heads,
            total_num_kv_heads=self.num_kv_heads,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.qkv_proj",
        )

        self.out_proj = RowParallelLinear(
            input_size=self.hidden_size,
            output_size=self.hidden_size,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.out_proj",
        )

        # Moondream uses 32-dim rotation out of 64-dim head (partial_rotary_factor=0.5)
        # HF Moondream uses non-interleaved RoPE (split by half)
        # In vLLM, is_neox_style=True means split by half (GPT-NeoX style)
        rope_parameters = {
            "rope_theta": config.rope_theta,
            "partial_rotary_factor": 32 / self.head_dim,  # 32/64 = 0.5
        }
        self.rotary_emb = get_rope(
            head_size=self.head_dim,
            max_position=config.max_context,
            rope_parameters=rope_parameters,
            is_neox_style=True,  # Moondream uses split-by-half (GPT-NeoX) style
        )

        self.scaling = self.head_dim**-0.5
        self.attn = Attention(
            num_heads=self.num_heads_per_partition,
            head_size=self.head_dim,
            scale=self.scaling,
            num_kv_heads=self.num_kv_heads_per_partition,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
        )

        # Tau scaling parameters for position-dependent attention
        # These are learned during training to modulate attention based on position
        # tau_wq and tau_wv need full qkv_dim for correct computation
        # Only heads are partitioned, qkv dimension is kept full for all-gather
        qkv_dim = self.hidden_size * 3  # Q + K + V dimension (full)
        self.tau_alpha = nn.Parameter(torch.zeros(self.num_heads_per_partition))
        self.tau_wq = nn.Parameter(torch.zeros(self.num_heads_per_partition, qkv_dim))
        self.tau_wv = nn.Parameter(torch.zeros(self.num_heads_per_partition, qkv_dim))
        self.tp_size = tp_size

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)

        q, k, v = qkv.split(
            [
                self.num_heads_per_partition * self.head_dim,
                self.num_kv_heads_per_partition * self.head_dim,
                self.num_kv_heads_per_partition * self.head_dim,
            ],
            dim=-1,
        )

        # Apply tau scaling to Q and V
        # Tau scaling has two components:
        # 1. Token-based: tok_q = tanh(gelu(qkv) @ tau_wq.T)
        # 2. Position-based: tau_pos = 1 + (sigmoid(alpha * log(pos+1)) - 0.5)
        # Final: tau = tok + tau_pos
        #
        # For TP, tau weights are sharded by head, but qkv_dim is kept full

        # Get full qkv for tau computation
        # With TP, reconstruct qkv in correct layout [q_full, k_full, v_full]
        # (all-gather would produce [q_0, k_0, v_0, q_1, k_1, v_1] - wrong)
        if self.tp_size > 1:
            # All-gather once, then reconstruct [q_full, k_full, v_full].
            qkv_full_sharded = tensor_model_parallel_all_gather(qkv.contiguous())
            q_local_dim = q.shape[-1]
            kv_local_dim = k.shape[-1]
            qkv_full_sharded = qkv_full_sharded.view(
                qkv.shape[0],
                self.tp_size,
                q_local_dim + 2 * kv_local_dim,
            )
            q_full = qkv_full_sharded[:, :, :q_local_dim].reshape(qkv.shape[0], -1)
            k_full = qkv_full_sharded[
                :, :, q_local_dim : q_local_dim + kv_local_dim
            ].reshape(qkv.shape[0], -1)
            v_full = qkv_full_sharded[:, :, q_local_dim + kv_local_dim :].reshape(
                qkv.shape[0], -1
            )
            qkv_full = torch.cat([q_full, k_full, v_full], dim=-1).contiguous()
        else:
            qkv_full = qkv

        # Compute tau scaling factors matching HF implementation exactly:
        # tok_feat = gelu(qkv)
        # tok_q = tanh(tok_feat @ tau_wq.T)  # [num_tokens, num_heads]
        # tau_pos = 1 + (sigmoid(alpha * log(pos+1)) - 0.5)  # [num_heads, num_tokens]
        # tau = (tok_q.T + tau_pos).T  # [num_tokens, num_heads]
        num_tokens = qkv_full.shape[0]
        orig_dtype = q.dtype

        # Token-based component
        tok_feat = F.gelu(qkv_full)  # Apply GELU activation
        tok_q = torch.tanh(tok_feat @ self.tau_wq.t())  # [N, H_per_partition]
        tok_v = torch.tanh(tok_feat @ self.tau_wv.t())  # [N, H_per_partition]

        # Position-based component
        # tau_pos = 1 + (sigmoid(alpha * log(pos+1)) - 0.5)
        # positions is [num_tokens], need to compute for each head
        # tau_alpha: [num_heads_per_partition]
        pos_float = (positions.to(orig_dtype) + 1.0).clamp(min=1e-6)
        pos_log = pos_float.log()  # [num_tokens]
        # alpha[:, None] * pos_log[None, :] -> [num_heads, num_tokens]
        tau_pos = 1.0 + (
            torch.sigmoid(self.tau_alpha[:, None] * pos_log[None, :]) - 0.5
        )  # [H_per_partition, N]

        # Combine token and position components
        tau_q = (tok_q + tau_pos.t()).to(orig_dtype)  # [N, H_per_partition]
        tau_v = (tok_v + tau_pos.t()).to(orig_dtype)  # [N, H_per_partition]

        # Reshape q and v to apply per-head tau scaling
        q = q.view(num_tokens, self.num_heads_per_partition, self.head_dim)
        v = v.view(num_tokens, self.num_kv_heads_per_partition, self.head_dim)

        # Apply tau scaling
        q = q * tau_q.unsqueeze(-1)
        v = v * tau_v[:, : self.num_kv_heads_per_partition].unsqueeze(-1)

        # Reshape back
        q = q.view(num_tokens, -1)
        v = v.view(num_tokens, -1)

        q, k = self.rotary_emb(positions, q, k)

        attn_output = self.attn(q, k, v)

        output, _ = self.out_proj(attn_output)
        return output

Moondream3DecoderLayer

Bases: Module

Decoder layer with attention + MLP/MoE.

Source code in vllm/model_executor/models/moondream3.py
class Moondream3DecoderLayer(nn.Module):
    """Decoder layer with attention + MLP/MoE."""

    def __init__(
        self,
        config: Moondream3TextConfig,
        cache_config=None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        layer_idx = extract_layer_index(prefix)
        self.layer_idx = layer_idx

        self.ln = nn.LayerNorm(config.dim, eps=1e-5, bias=True)

        self.attn = Moondream3Attention(
            config=config,
            layer_idx=layer_idx,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
        )

        # Use MoE for layers >= moe_start_layer, standard MLP otherwise
        if layer_idx >= config.moe_start_layer:
            self.mlp = Moondream3TextMoE(
                hidden_size=config.dim,
                expert_inner_dim=config.moe_expert_inner_dim,
                num_experts=config.moe_num_experts,
                experts_per_token=config.moe_experts_per_token,
                quant_config=quant_config,
                prefix=f"{prefix}.mlp",
            )
        else:
            self.mlp = Moondream3TextMLP(
                hidden_size=config.dim,
                intermediate_size=config.ff_dim,
                quant_config=quant_config,
                prefix=f"{prefix}.mlp",
            )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        # Pre-norm architecture
        normed = self.ln(hidden_states)
        attn_out = self.attn(positions, normed)
        mlp_out = self.mlp(normed)
        hidden_states = hidden_states + attn_out + mlp_out
        return hidden_states

Moondream3DummyInputsBuilder

Bases: BaseDummyInputsBuilder[Moondream3ProcessingInfo]

Dummy inputs builder for profiling.

Source code in vllm/model_executor/models/moondream3.py
class Moondream3DummyInputsBuilder(BaseDummyInputsBuilder[Moondream3ProcessingInfo]):
    """Dummy inputs builder for profiling."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        return (
            "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>"
            "What is this image?<|md_reserved_2|>"
        )

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
        mm_options: Mapping[str, BaseDummyOptions] | None = None,
        mm_processor_kwargs: Mapping[str, object] | None = None,
    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)
        return {
            "image": self._get_dummy_images(
                width=378,
                height=378,
                num_images=num_images,
            )
        }

Moondream3ForCausalLM

Bases: Module, SupportsMultiModal, SupportsPP

Moondream3 multimodal model for causal language modeling.

vLLM supports the standard autoregressive Moondream3 query and caption prompt formats. The region-module point/detect skills require custom coordinate decoding and are intentionally not exposed here.
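The query prompt format accepted here matches the dummy text used for profiling (see Moondream3DummyInputsBuilder above). A minimal offline-inference sketch, assuming a hypothetical checkpoint id and local image path:

from PIL import Image
from vllm import LLM, SamplingParams

# Placeholder checkpoint id and image; adjust for your setup.
llm = LLM(model="moondream/moondream3-preview", trust_remote_code=True)
image = Image.open("example.jpg")

prompt = (
    "<|endoftext|><image><|md_reserved_0|>query<|md_reserved_1|>"
    "What is this image?<|md_reserved_2|>"
)
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)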

Source code in vllm/model_executor/models/moondream3.py
@MULTIMODAL_REGISTRY.register_processor(
    Moondream3MultiModalProcessor,
    info=Moondream3ProcessingInfo,
    dummy_inputs=Moondream3DummyInputsBuilder,
)
class Moondream3ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
    """Moondream3 multimodal model for causal language modeling.

    vLLM supports the standard autoregressive Moondream3 query and caption
    prompt formats. The region-module point/detect skills require custom
    coordinate decoding and are intentionally not exposed here.
    """

    supports_multimodal = True
    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    }

    def __init__(
        self,
        *,
        vllm_config: VllmConfig,
        prefix: str = "",
    ):
        super().__init__()

        hf_config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        cache_config = vllm_config.cache_config

        # Reuse the transformers_utils config implementation.
        if isinstance(hf_config, Moondream3Config):
            self.config = hf_config
        else:
            config_dict = hf_config.config if hasattr(hf_config, "config") else {}
            self.config = Moondream3Config(config=config_dict)

        with self._mark_tower_model(vllm_config, "image"):
            # Vision encoder
            self.vision = Moondream3VisionEncoder(
                config=self.config.vision_config,
                quant_config=quant_config,
                prefix=maybe_prefix(prefix, "vision"),
            )

            # Vision projection
            self.vision_proj = Moondream3VisionProjection(
                input_dim=self.config.vision_config.enc_dim,
                inner_dim=self.config.vision_config.proj_inner_dim,
                output_dim=self.config.text_config.dim,
                quant_config=quant_config,
                prefix=maybe_prefix(prefix, "vision_proj"),
            )

        with self._mark_language_model(vllm_config):
            # Text decoder
            self.text = Moondream3TextModel(
                config=self.config.text_config,
                cache_config=cache_config,
                quant_config=quant_config,
                prefix=maybe_prefix(prefix, "text"),
            )

            # LM head (with bias - Moondream3 has lm_head bias)
            self.lm_head = ParallelLMHead(
                self.config.text_config.vocab_size,
                self.config.text_config.dim,
                bias=True,
                quant_config=quant_config,
                prefix=maybe_prefix(prefix, "lm_head"),
            )

        self.logits_processor = LogitsProcessor(self.config.text_config.vocab_size)
        self.make_empty_intermediate_tensors = self.text.make_empty_intermediate_tensors
        self._answer_id = getattr(
            self.config,
            "answer_token_id",
            getattr(hf_config, "answer_token_id", 3),
        )

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
        if modality == "image":
            return "<image>"
        return None

    def get_language_model(self) -> nn.Module:
        return self.text

    def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
        return num_image_tokens

    def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
        return num_vision_tokens

    def _split_pixel_values(
        self,
        pixel_values: object,
    ) -> list[torch.Tensor]:
        # The processor should standardize image inputs into:
        # - torch.Tensor [num_images, num_crops, C, H, W], or
        # - list[torch.Tensor[num_crops, C, H, W]] for ragged crops.
        if isinstance(pixel_values, torch.Tensor):
            if pixel_values.dim() != 5:
                raise ValueError(
                    "Expected `pixel_values` tensor with shape "
                    "[num_images, num_crops, C, H, W], got "
                    f"{tuple(pixel_values.shape)}."
                )
            return [pv.contiguous() for pv in pixel_values]

        if isinstance(pixel_values, (list, tuple)):
            tensors: list[torch.Tensor] = []
            for value in pixel_values:
                if not isinstance(value, torch.Tensor):
                    raise TypeError(
                        "Expected each `pixel_values` element to be a tensor, "
                        f"got {type(value)!r}."
                    )
                if value.dim() != 4:
                    raise ValueError(
                        f"Unsupported pixel_values element shape {tuple(value.shape)}."
                    )
                tensors.append(value.contiguous())
            return tensors

        raise TypeError(
            "pixel_values must be a tensor or a sequence of tensors, "
            f"got {type(pixel_values)!r}."
        )

    def _split_tilings(
        self,
        tilings: object,
        expected: int,
    ) -> list[tuple[int, int] | None]:
        if tilings is None:
            return [None] * expected

        if isinstance(tilings, torch.Tensor):
            if tilings.dim() != 2 or tilings.shape[1] != 2:
                raise ValueError(
                    "Expected `tilings` tensor with shape [num_images, 2], got "
                    f"{tuple(tilings.shape)}."
                )
            tiling_items = tilings.tolist()
        elif isinstance(tilings, (list, tuple)):
            tiling_items = list(tilings)
        else:
            raise TypeError(
                "tilings must be None, a tensor or a sequence of tuples, "
                f"got {type(tilings)!r}."
            )

        if len(tiling_items) != expected:
            raise ValueError(
                "Mismatch between the number of pixel_values entries "
                f"({expected}) and tilings ({len(tiling_items)})."
            )

        normalized: list[tuple[int, int] | None] = []
        for tiling in tiling_items:
            if tiling is None:
                normalized.append(None)
                continue
            if isinstance(tiling, torch.Tensor):
                tiling = tiling.tolist()
            if isinstance(tiling, (list, tuple)) and len(tiling) == 2:
                normalized.append((int(tiling[0]), int(tiling[1])))
            else:
                raise ValueError(
                    f"Each tiling entry must be a pair of integers, got {tiling!r}."
                )
        return normalized

    def _parse_image_inputs(self, **kwargs: object) -> list[Moondream3ImageInput]:
        pixel_values = kwargs.get("pixel_values")
        if pixel_values is None:
            return []

        pixel_values_list = self._split_pixel_values(pixel_values)
        tilings_list = self._split_tilings(
            kwargs.get("tilings"), len(pixel_values_list)
        )

        image_inputs: list[Moondream3ImageInput] = []
        for value, tiling in zip(pixel_values_list, tilings_list):
            if value.dim() != 4:
                raise ValueError(
                    f"Expected 4D tensor for crops, got {tuple(value.shape)}."
                )
            image_inputs.append(Moondream3ImageInput(pixel_values=value, tiling=tiling))
        return image_inputs

    def _encode_image_input(self, image_input: Moondream3ImageInput) -> torch.Tensor:
        pixel_values = image_input.pixel_values
        if pixel_values.dim() != 4:
            raise ValueError(
                f"Expected 4D tensor for crops, got {tuple(pixel_values.shape)}."
            )

        device = self.vision.patch_emb.weight.device
        dtype = self.vision.patch_emb.weight.dtype
        pixel_values = pixel_values.to(device=device, dtype=dtype)

        features = self.vision(pixel_values)

        # Grid size = crop_size / patch_size (e.g., 378 / 14 = 27)
        grid_size = (
            self.config.vision_config.crop_size
            // self.config.vision_config.enc_patch_size
        )
        enc_dim = self.config.vision_config.enc_dim
        global_features = features[0]

        if features.shape[0] > 1:
            if image_input.tiling is None:
                raise ValueError(
                    "Missing tiling metadata for multi-crop Moondream image."
                )
            local = features[1:].contiguous().view(-1, grid_size, grid_size, enc_dim)
            reconstructed = reconstruct_from_crops(
                local,
                image_input.tiling,
                overlap_margin=self.config.vision_config.overlap_margin,
                patch_size=1,
            )
        else:
            reconstructed = global_features.view(grid_size, grid_size, enc_dim)

        recon = reconstructed.permute(2, 0, 1).contiguous()
        # Mirror HF reference behavior: reconstructed local features are pooled
        # to enc_n_layers x enc_n_layers. For moondream3-preview this is 27x27.
        pooled_size = self.config.vision_config.enc_n_layers
        if pooled_size != grid_size:
            logger.warning_once(
                "Moondream3 pooled_size (%d) differs from crop grid (%d). "
                "Using enc_n_layers to match HF reference behavior.",
                pooled_size,
                grid_size,
            )
        recon = F.adaptive_avg_pool2d(recon, output_size=(pooled_size, pooled_size))
        recon = recon.permute(1, 2, 0).contiguous().view(-1, enc_dim)

        combined = torch.cat([global_features, recon], dim=-1).unsqueeze(0)
        projected = self.vision_proj(combined).squeeze(0)

        # Note: Vision embeddings are already synchronized across TP ranks
        # because the vision projection uses RowParallelLinear which performs
        # all-reduce internally, ensuring identical outputs on all ranks.

        return projected

    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
        """Generate the HF image prefix: BOS embedding + 729 image embeddings."""
        image_inputs = self._parse_image_inputs(**kwargs)
        if not image_inputs:
            return []

        device = self.vision.patch_emb.weight.device
        bos_ids = torch.tensor([self.config.bos_token_id], device=device)
        bos_embedding = self.text.embed_input_ids(bos_ids)

        embeddings: list[torch.Tensor] = []
        for image_input in image_inputs:
            image_embeddings = self._encode_image_input(image_input)
            embeddings.append(
                torch.cat([bos_embedding.to(image_embeddings.dtype), image_embeddings])
            )
        return embeddings

    def forward(
        self,
        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs,
    ) -> torch.Tensor | IntermediateTensors:
        hidden_states = self.text(
            input_ids=input_ids,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor | None:
        logits = self.logits_processor(self.lm_head, hidden_states)
        if logits is not None:
            logits[:, self._answer_id] = float("-inf")
        return logits

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load weights with remapping from HuggingFace format."""

        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()

        # Get expert intermediate size for fc1 splitting

        for name, loaded_weight in weights:
            # Map from HF naming to vLLM naming
            # model.vision.* -> vision.*
            # model.text.* -> text.*
            if name.startswith("model."):
                name = name[6:]  # Remove "model." prefix

            # Specific name mappings
            # Vision projection: vision.proj_mlp.fc1 -> vision_proj.fc1
            name = name.replace("vision.proj_mlp.", "vision_proj.")

            # Text embedding: text.wte (no suffix) -> text.wte.weight
            if name == "text.wte":
                name = "text.wte.weight"

            # LM head: text.lm_head -> lm_head
            name = name.replace("text.lm_head.", "lm_head.")

            # Attention mapping
            name = name.replace(".attn.qkv.", ".attn.qkv_proj.")
            name = name.replace(".attn.proj.", ".attn.out_proj.")

            # Tau attention scaling weights
            # HF format: .attn.tau.alpha -> .attn.tau_alpha
            name = name.replace(".attn.tau.alpha", ".attn.tau_alpha")
            name = name.replace(".attn.tau.wq", ".attn.tau_wq")
            name = name.replace(".attn.tau.wv", ".attn.tau_wv")

            # MoE router mapping: mlp.router -> mlp.gate
            name = name.replace(".mlp.router.", ".mlp.gate.")

            # Handle MoE expert weights for layers 4+ with expert parallelism
            # fc1.weight: [n_experts, expert_inner_dim * 2, hidden_size] (gate+up)
            # fc2.weight: [n_experts, hidden_size, expert_inner_dim] (down)
            # Each GPU stores n_experts/tp_size experts
            # Note: Only 3D weights are MoE, 2D weights are standard MLP
            if ".mlp.fc1.weight" in name and loaded_weight.dim() == 3:
                from vllm.distributed import get_tensor_model_parallel_rank

                tp_size = get_tensor_model_parallel_world_size()
                tp_rank = get_tensor_model_parallel_rank()
                num_experts = loaded_weight.shape[0]
                experts_per_rank = num_experts // tp_size
                expert_start = tp_rank * experts_per_rank
                expert_end = expert_start + experts_per_rank
                # Shard by expert dimension
                loaded_weight = loaded_weight[expert_start:expert_end].contiguous()
                # Map to our custom MoE format: mlp.fc1_weight
                name = name.replace(".mlp.fc1.weight", ".mlp.fc1_weight")

            if ".mlp.fc2.weight" in name and loaded_weight.dim() == 3:
                from vllm.distributed import get_tensor_model_parallel_rank

                tp_size = get_tensor_model_parallel_world_size()
                tp_rank = get_tensor_model_parallel_rank()
                num_experts = loaded_weight.shape[0]
                experts_per_rank = num_experts // tp_size
                expert_start = tp_rank * experts_per_rank
                expert_end = expert_start + experts_per_rank
                # Shard by expert dimension
                loaded_weight = loaded_weight[expert_start:expert_end].contiguous()
                # Map to our custom MoE format: mlp.fc2_weight
                name = name.replace(".mlp.fc2.weight", ".mlp.fc2_weight")

            # Handle tau weights with tensor parallelism
            # tau_alpha: [num_heads] -> [num_heads/tp]
            # tau_wq: [num_heads, qkv_dim] -> [num_heads/tp, qkv_dim]
            # tau_wv: [num_heads, qkv_dim] -> [num_heads/tp, qkv_dim]
            if ".tau_alpha" in name:
                from vllm.distributed import get_tensor_model_parallel_rank

                tp_size = get_tensor_model_parallel_world_size()
                tp_rank = get_tensor_model_parallel_rank()
                num_heads = loaded_weight.shape[0]
                heads_per_partition = num_heads // tp_size
                start = tp_rank * heads_per_partition
                end = start + heads_per_partition
                loaded_weight = loaded_weight[start:end].contiguous()

            if ".tau_wq" in name or ".tau_wv" in name:
                from vllm.distributed import get_tensor_model_parallel_rank

                tp_size = get_tensor_model_parallel_world_size()
                tp_rank = get_tensor_model_parallel_rank()
                num_heads, qkv_dim = loaded_weight.shape
                heads_per_partition = num_heads // tp_size
                # Only shard by head dimension, keep full qkv_dim for all-gather
                head_start = tp_rank * heads_per_partition
                head_end = head_start + heads_per_partition
                loaded_weight = loaded_weight[head_start:head_end, :].contiguous()

            if name in params_dict:
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight)
                loaded_params.add(name)

        return loaded_params

embed_multimodal

embed_multimodal(**kwargs: object) -> MultiModalEmbeddings

Generate the HF image prefix: BOS embedding + 729 image embeddings.

Source code in vllm/model_executor/models/moondream3.py
def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
    """Generate the HF image prefix: BOS embedding + 729 image embeddings."""
    image_inputs = self._parse_image_inputs(**kwargs)
    if not image_inputs:
        return []

    device = self.vision.patch_emb.weight.device
    bos_ids = torch.tensor([self.config.bos_token_id], device=device)
    bos_embedding = self.text.embed_input_ids(bos_ids)

    embeddings: list[torch.Tensor] = []
    for image_input in image_inputs:
        image_embeddings = self._encode_image_input(image_input)
        embeddings.append(
            torch.cat([bos_embedding.to(image_embeddings.dtype), image_embeddings])
        )
    return embeddings

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

Load weights with remapping from HuggingFace format.

Source code in vllm/model_executor/models/moondream3.py
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
    """Load weights with remapping from HuggingFace format."""

    params_dict = dict(self.named_parameters())
    loaded_params: set[str] = set()

    # Get expert intermediate size for fc1 splitting

    for name, loaded_weight in weights:
        # Map from HF naming to vLLM naming
        # model.vision.* -> vision.*
        # model.text.* -> text.*
        if name.startswith("model."):
            name = name[6:]  # Remove "model." prefix

        # Specific name mappings
        # Vision projection: vision.proj_mlp.fc1 -> vision_proj.fc1
        name = name.replace("vision.proj_mlp.", "vision_proj.")

        # Text embedding: text.wte (no suffix) -> text.wte.weight
        if name == "text.wte":
            name = "text.wte.weight"

        # LM head: text.lm_head -> lm_head
        name = name.replace("text.lm_head.", "lm_head.")

        # Attention mapping
        name = name.replace(".attn.qkv.", ".attn.qkv_proj.")
        name = name.replace(".attn.proj.", ".attn.out_proj.")

        # Tau attention scaling weights
        # HF format: .attn.tau.alpha -> .attn.tau_alpha
        name = name.replace(".attn.tau.alpha", ".attn.tau_alpha")
        name = name.replace(".attn.tau.wq", ".attn.tau_wq")
        name = name.replace(".attn.tau.wv", ".attn.tau_wv")

        # MoE router mapping: mlp.router -> mlp.gate
        name = name.replace(".mlp.router.", ".mlp.gate.")

        # Handle MoE expert weights for layers 4+ with expert parallelism
        # fc1.weight: [n_experts, expert_inner_dim * 2, hidden_size] (gate+up)
        # fc2.weight: [n_experts, hidden_size, expert_inner_dim] (down)
        # Each GPU stores n_experts/tp_size experts
        # Note: Only 3D weights are MoE, 2D weights are standard MLP
        if ".mlp.fc1.weight" in name and loaded_weight.dim() == 3:
            from vllm.distributed import get_tensor_model_parallel_rank

            tp_size = get_tensor_model_parallel_world_size()
            tp_rank = get_tensor_model_parallel_rank()
            num_experts = loaded_weight.shape[0]
            experts_per_rank = num_experts // tp_size
            expert_start = tp_rank * experts_per_rank
            expert_end = expert_start + experts_per_rank
            # Shard by expert dimension
            loaded_weight = loaded_weight[expert_start:expert_end].contiguous()
            # Map to our custom MoE format: mlp.fc1_weight
            name = name.replace(".mlp.fc1.weight", ".mlp.fc1_weight")

        if ".mlp.fc2.weight" in name and loaded_weight.dim() == 3:
            from vllm.distributed import get_tensor_model_parallel_rank

            tp_size = get_tensor_model_parallel_world_size()
            tp_rank = get_tensor_model_parallel_rank()
            num_experts = loaded_weight.shape[0]
            experts_per_rank = num_experts // tp_size
            expert_start = tp_rank * experts_per_rank
            expert_end = expert_start + experts_per_rank
            # Shard by expert dimension
            loaded_weight = loaded_weight[expert_start:expert_end].contiguous()
            # Map to our custom MoE format: mlp.fc2_weight
            name = name.replace(".mlp.fc2.weight", ".mlp.fc2_weight")

        # Handle tau weights with tensor parallelism
        # tau_alpha: [num_heads] -> [num_heads/tp]
        # tau_wq: [num_heads, qkv_dim] -> [num_heads/tp, qkv_dim]
        # tau_wv: [num_heads, qkv_dim] -> [num_heads/tp, qkv_dim]
        if ".tau_alpha" in name:
            from vllm.distributed import get_tensor_model_parallel_rank

            tp_size = get_tensor_model_parallel_world_size()
            tp_rank = get_tensor_model_parallel_rank()
            num_heads = loaded_weight.shape[0]
            heads_per_partition = num_heads // tp_size
            start = tp_rank * heads_per_partition
            end = start + heads_per_partition
            loaded_weight = loaded_weight[start:end].contiguous()

        if ".tau_wq" in name or ".tau_wv" in name:
            from vllm.distributed import get_tensor_model_parallel_rank

            tp_size = get_tensor_model_parallel_world_size()
            tp_rank = get_tensor_model_parallel_rank()
            num_heads, qkv_dim = loaded_weight.shape
            heads_per_partition = num_heads // tp_size
            # Only shard by head dimension, keep full qkv_dim for all-gather
            head_start = tp_rank * heads_per_partition
            head_end = head_start + heads_per_partition
            loaded_weight = loaded_weight[head_start:head_end, :].contiguous()

        if name in params_dict:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)
            loaded_params.add(name)

    return loaded_params

Moondream3ImageInput dataclass

Container holding per-image inputs for embedding.

Source code in vllm/model_executor/models/moondream3.py
@dataclass(frozen=True)
class Moondream3ImageInput:
    """Container holding per-image inputs for embedding."""

    pixel_values: torch.Tensor
    tiling: tuple[int, int] | None

Moondream3MultiModalProcessor

Bases: BaseMultiModalProcessor[Moondream3ProcessingInfo]

Multimodal processor for Moondream3.

Source code in vllm/model_executor/models/moondream3.py
class Moondream3MultiModalProcessor(BaseMultiModalProcessor[Moondream3ProcessingInfo]):
    """Multimodal processor for Moondream3."""

    image_placeholder: str = "<image>"
    bos_image_placeholder: str = "<|endoftext|><image>"

    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
        tok_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        # Moondream3's processor handles images directly rather than exposing a
        # separate `image_processor`, so keep the cache path on text+MM calls.
        return super()._call_hf_processor(prompt, mm_data, mm_kwargs, tok_kwargs)

    @cached_property
    def bos_image_placeholder_tokens(self) -> list[int]:
        tokenizer = self.info.get_tokenizer()
        token_ids = tokenizer.encode(
            self.bos_image_placeholder,
            add_special_tokens=False,
        )
        if len(token_ids) < 2:
            raise ValueError(
                "Tokenizer could not encode Moondream3 BOS/image placeholder "
                f"{self.bos_image_placeholder!r}."
            )
        return token_ids

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        return {
            "pixel_values": MultiModalFieldConfig.batched("image"),
            "tilings": MultiModalFieldConfig.batched("image", keep_on_cpu=True),
        }

    def _hf_processor_applies_updates(
        self,
        prompt_text: str,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Mapping[str, object],
    ) -> bool:
        # Moondream3 HF processor does NOT expand placeholder tokens.
        # vLLM expands BOS + <image> so the whole HF image prefix is marked
        # bidirectional by the multimodal prefix-LM mask.
        return False

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargsItems,
    ) -> list[PromptUpdate]:
        image_size = self.info.get_image_size_with_most_features()
        num_image_tokens = self.info.get_num_image_tokens(
            image_width=image_size.width,
            image_height=image_size.height,
        )
        placeholder_tokens = self.bos_image_placeholder_tokens
        bos_token = placeholder_tokens[0]
        image_token = placeholder_tokens[-1]
        return [
            PromptReplacement(
                modality="image",
                target=placeholder_tokens,
                replacement=PromptUpdateDetails(
                    full=[bos_token] + [image_token] * (num_image_tokens - 1),
                ),
            ),
        ]

Moondream3ProcessingInfo

Bases: BaseProcessingInfo

Processing info for Moondream3.

Source code in vllm/model_executor/models/moondream3.py
class Moondream3ProcessingInfo(BaseProcessingInfo):
    """Processing info for Moondream3."""

    def get_hf_config(self):
        return self.ctx.get_hf_config()

    def get_hf_processor(self, **kwargs: object):
        return self.ctx.get_hf_processor(Moondream3Processor, **kwargs)

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        return {"image": 1}

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        # HF pre-fills BOS together with the fixed 27x27 vision grid under
        # the same bidirectional prefix mask: 1 BOS + 729 image embeddings.
        return 730

    def get_image_size_with_most_features(self) -> ImageSize:
        return ImageSize(width=378, height=378)

    def get_max_image_tokens(self) -> int:
        return 730

    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        return {"image": self.get_max_image_tokens()}

Moondream3TextMLP

Bases: Module

Standard MLP for non-MoE layers (layers 0-3).

Source code in vllm/model_executor/models/moondream3.py
class Moondream3TextMLP(nn.Module):
    """Standard MLP for non-MoE layers (layers 0-3)."""

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.fc1 = ColumnParallelLinear(
            hidden_size,
            intermediate_size,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.fc1",
        )
        self.act = get_act_fn("gelu_pytorch_tanh")
        self.fc2 = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.fc2",
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x, _ = self.fc1(x)
        x = self.act(x)
        x, _ = self.fc2(x)
        return x

Moondream3TextMoE

Bases: Module

Mixture of Experts layer for layers 4+ with expert parallelism.

Moondream3 uses a custom GeGLU activation: gelu(h) * (g + 1) where fc1 outputs [gate, up] and the activation is gelu(gate) * (up + 1).

Uses expert parallelism where each GPU stores num_experts/tp_size experts. Routing and communication handled via all-to-all or replicated computation.

Checkpoint format:

- fc1.weight: [num_experts, expert_inner_dim * 2, hidden_size] (gate+up)
- fc2.weight: [num_experts, hidden_size, expert_inner_dim] (down)
- router.weight: [num_experts, hidden_size]
- router.bias: [num_experts]
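A minimal sketch of the GeGLU variant and of how the fused path reproduces it by folding the "+1" into a bias on the up half of the fused fc1 output (tensor shapes here are illustrative, not the model's actual dimensions):

import torch
import torch.nn.functional as F

hidden_size, inner = 2048, 768
x = torch.randn(4, hidden_size)               # [n_tokens, hidden_size]
fc1_w = torch.randn(2 * inner, hidden_size)   # [expert_inner_dim * 2, hidden_size]
fc2_w = torch.randn(hidden_size, inner)       # [hidden_size, expert_inner_dim]

# Reference path: split fc1 output into (h, g), apply gelu(h) * (g + 1).
h, g = F.linear(x, fc1_w).chunk(2, dim=-1)
ref = F.linear(F.gelu(h) * (g + 1.0), fc2_w)

# Fused-kernel equivalent: add a +1 bias to the up half, then use a plain
# GELU-gated product gelu(h) * g.
bias = torch.zeros(2 * inner)
bias[inner:] = 1.0
h2, g2 = (F.linear(x, fc1_w) + bias).chunk(2, dim=-1)
fused = F.linear(F.gelu(h2) * g2, fc2_w)

assert torch.allclose(ref, fused, atol=1e-5)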

Source code in vllm/model_executor/models/moondream3.py
class Moondream3TextMoE(nn.Module):
    """Mixture of Experts layer for layers 4+ with expert parallelism.

    Moondream3 uses a custom GeGLU activation: gelu(h) * (g + 1)
    where fc1 outputs [gate, up] and the activation is gelu(gate) * (up + 1).

    Uses expert parallelism where each GPU stores num_experts/tp_size experts.
    Routing and communication handled via all-to-all or replicated computation.

    Checkpoint format:
    - fc1.weight: [num_experts, expert_inner_dim * 2, hidden_size] (gate+up)
    - fc2.weight: [num_experts, hidden_size, expert_inner_dim] (down)
    - router.weight: [num_experts, hidden_size]
    - router.bias: [num_experts]
    """

    def __init__(
        self,
        hidden_size: int,
        expert_inner_dim: int,
        num_experts: int,
        experts_per_token: int,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.expert_inner_dim = expert_inner_dim
        self.num_experts = num_experts
        self.experts_per_token = experts_per_token

        # Expert parallelism: each GPU stores a subset of experts
        self.tp_size = get_tensor_model_parallel_world_size()
        self.experts_per_rank = num_experts // self.tp_size
        self.num_local_experts = self.experts_per_rank

        # Router (gate) - use ReplicatedLinear for compatibility
        self.gate = ReplicatedLinear(
            hidden_size,
            num_experts,
            bias=True,
            quant_config=None,
            prefix=f"{prefix}.gate",
        )

        # Local expert weights (only store experts_per_rank experts)
        # fc1: [experts_per_rank, expert_inner_dim * 2, hidden_size]
        # fc2: [experts_per_rank, hidden_size, expert_inner_dim]
        self.fc1_weight = nn.Parameter(
            torch.empty(self.num_local_experts, expert_inner_dim * 2, hidden_size)
        )
        self.fc2_weight = nn.Parameter(
            torch.empty(self.num_local_experts, hidden_size, expert_inner_dim)
        )
        self._use_fused_moe = True

        local_expert_start = get_tensor_model_parallel_rank() * self.experts_per_rank
        expert_map = torch.full((num_experts,), -1, dtype=torch.int32)
        expert_map[local_expert_start : local_expert_start + self.num_local_experts] = (
            torch.arange(self.num_local_experts, dtype=torch.int32)
        )
        self.register_buffer("_expert_map", expert_map, persistent=False)

        # Preserve Moondream3's exact GeGLU variant (gelu(h) * (g + 1)) by
        # adding +1 bias to the second half of the fused fc1 activations.
        fused_w1_bias = torch.zeros(
            self.num_local_experts,
            expert_inner_dim * 2,
            dtype=torch.float32,
        )
        fused_w1_bias[:, expert_inner_dim:] = 1.0
        self.register_buffer("_fused_w1_bias", fused_w1_bias, persistent=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass with expert parallelism and custom GeGLU activation."""

        # Get router logits and compute top-k
        router_logits, _ = self.gate(x)  # [num_tokens, num_experts]
        topk_logits, topk_ids = torch.topk(
            router_logits, self.experts_per_token, dim=-1
        )
        # Softmax over selected experts
        topk_weights = F.softmax(topk_logits, dim=-1, dtype=torch.float32).to(x.dtype)

        if self._use_fused_moe and x.is_cuda:
            try:
                out = fused_experts(
                    hidden_states=x.contiguous(),
                    w1=self.fc1_weight,
                    w2=self.fc2_weight,
                    topk_weights=topk_weights.contiguous(),
                    topk_ids=topk_ids.contiguous(),
                    activation=MoEActivation.GELU,
                    global_num_experts=self.num_experts,
                    expert_map=self._expert_map,
                    quant_config=biased_moe_quant_config(self._fused_w1_bias, None),
                )
                out = tensor_model_parallel_all_reduce(out)
                return out
            except (NotImplementedError, RuntimeError) as exc:
                self._use_fused_moe = False
                logger.warning_once(
                    "Disabling fused Moondream3 MoE path and falling back to "
                    "the Python expert loop: %s",
                    str(exc),
                )

        tp_rank = get_tensor_model_parallel_rank()
        # Compute local expert range
        local_expert_start = tp_rank * self.experts_per_rank

        # Fallback path for environments where fused kernels are unavailable.
        out = x.new_zeros(x.shape)

        for local_expert_idx in range(self.num_local_experts):
            global_expert_id = local_expert_start + local_expert_idx

            # Find tokens assigned to this expert
            token_pos, which_k = (topk_ids == global_expert_id).nonzero(as_tuple=True)
            if token_pos.numel() == 0:
                continue

            # Get tokens and their routing weights
            x_tok = x.index_select(0, token_pos)  # [n_tokens, hidden_size]
            gate_tok = topk_weights[token_pos, which_k]  # [n_tokens]

            # fc1: [expert_inner_dim * 2, hidden_size]
            # h_full: [n_tokens, expert_inner_dim * 2]
            h_full = F.linear(x_tok, self.fc1_weight[local_expert_idx])

            # GeGLU with (g + 1): h, g = split; output = gelu(h) * (g + 1)
            # HF MoE uses exact GELU (not tanh approximation).
            h, g = h_full.chunk(2, dim=-1)  # Each [n_tokens, expert_inner_dim]
            h = F.gelu(h) * (g + 1.0)

            # fc2: [hidden_size, expert_inner_dim]
            # y: [n_tokens, hidden_size]
            y = F.linear(h, self.fc2_weight[local_expert_idx])

            # Apply routing weight
            y = y * gate_tok.unsqueeze(-1)

            # Accumulate output
            out.index_add_(0, token_pos, y)

        # All-reduce to combine results from all experts across GPUs
        out = tensor_model_parallel_all_reduce(out)

        return out

forward

forward(x: Tensor) -> Tensor

Forward pass with expert parallelism and custom GeGLU activation.

Source code in vllm/model_executor/models/moondream3.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Forward pass with expert parallelism and custom GeGLU activation."""

    # Get router logits and compute top-k
    router_logits, _ = self.gate(x)  # [num_tokens, num_experts]
    topk_logits, topk_ids = torch.topk(
        router_logits, self.experts_per_token, dim=-1
    )
    # Softmax over selected experts
    topk_weights = F.softmax(topk_logits, dim=-1, dtype=torch.float32).to(x.dtype)

    if self._use_fused_moe and x.is_cuda:
        try:
            out = fused_experts(
                hidden_states=x.contiguous(),
                w1=self.fc1_weight,
                w2=self.fc2_weight,
                topk_weights=topk_weights.contiguous(),
                topk_ids=topk_ids.contiguous(),
                activation=MoEActivation.GELU,
                global_num_experts=self.num_experts,
                expert_map=self._expert_map,
                quant_config=biased_moe_quant_config(self._fused_w1_bias, None),
            )
            out = tensor_model_parallel_all_reduce(out)
            return out
        except (NotImplementedError, RuntimeError) as exc:
            self._use_fused_moe = False
            logger.warning_once(
                "Disabling fused Moondream3 MoE path and falling back to "
                "the Python expert loop: %s",
                str(exc),
            )

    tp_rank = get_tensor_model_parallel_rank()
    # Compute local expert range
    local_expert_start = tp_rank * self.experts_per_rank

    # Fallback path for environments where fused kernels are unavailable.
    out = x.new_zeros(x.shape)

    for local_expert_idx in range(self.num_local_experts):
        global_expert_id = local_expert_start + local_expert_idx

        # Find tokens assigned to this expert
        token_pos, which_k = (topk_ids == global_expert_id).nonzero(as_tuple=True)
        if token_pos.numel() == 0:
            continue

        # Get tokens and their routing weights
        x_tok = x.index_select(0, token_pos)  # [n_tokens, hidden_size]
        gate_tok = topk_weights[token_pos, which_k]  # [n_tokens]

        # fc1: [expert_inner_dim * 2, hidden_size]
        # h_full: [n_tokens, expert_inner_dim * 2]
        h_full = F.linear(x_tok, self.fc1_weight[local_expert_idx])

        # GeGLU with (g + 1): h, g = split; output = gelu(h) * (g + 1)
        # HF MoE uses exact GELU (not tanh approximation).
        h, g = h_full.chunk(2, dim=-1)  # Each [n_tokens, expert_inner_dim]
        h = F.gelu(h) * (g + 1.0)

        # fc2: [hidden_size, expert_inner_dim]
        # y: [n_tokens, hidden_size]
        y = F.linear(h, self.fc2_weight[local_expert_idx])

        # Apply routing weight
        y = y * gate_tok.unsqueeze(-1)

        # Accumulate output
        out.index_add_(0, token_pos, y)

    # All-reduce to combine results from all experts across GPUs
    out = tensor_model_parallel_all_reduce(out)

    return out

Moondream3TextModel

Bases: Module

Text decoder model.

Source code in vllm/model_executor/models/moondream3.py
class Moondream3TextModel(nn.Module):
    """Text decoder model."""

    def __init__(
        self,
        config: Moondream3TextConfig,
        cache_config=None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.config = config

        self.wte = VocabParallelEmbedding(
            config.vocab_size,
            config.dim,
            prefix=f"{prefix}.wte",
        )

        blocks_prefix = maybe_prefix(prefix, "blocks")
        self.start_layer, self.end_layer, self.blocks = make_layers(
            config.n_layers,
            lambda prefix: Moondream3DecoderLayer(
                config=config,
                cache_config=cache_config,
                quant_config=quant_config,
                prefix=prefix,
            ),
            prefix=blocks_prefix,
        )

        self.post_ln = nn.LayerNorm(config.dim, eps=1e-5, bias=True)
        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
            ["hidden_states"], config.dim
        )

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.wte(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor | IntermediateTensors:
        pp_group = get_pp_group()
        if pp_group.is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                assert input_ids is not None
                hidden_states = self.embed_input_ids(input_ids)
        else:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]

        for i, layer in enumerate(
            islice(self.blocks, self.start_layer, self.end_layer)
        ):
            hidden_states = layer(positions, hidden_states)

        if not pp_group.is_last_rank:
            return IntermediateTensors({"hidden_states": hidden_states})

        hidden_states = self.post_ln(hidden_states)
        return hidden_states

Moondream3VisionAttention

Bases: Module

Self-attention for vision encoder (bidirectional).

Source code in vllm/model_executor/models/moondream3.py
class Moondream3VisionAttention(nn.Module):
    """Self-attention for vision encoder (bidirectional)."""

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads

        self.qkv_proj = QKVParallelLinear(
            hidden_size=hidden_size,
            head_size=self.head_dim,
            total_num_heads=num_heads,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.qkv_proj",
        )
        self.out_proj = RowParallelLinear(
            input_size=hidden_size,
            output_size=hidden_size,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.out_proj",
        )

        tp_size = get_tensor_model_parallel_world_size()
        self.num_heads_per_partition = num_heads // tp_size

        self.attn = MMEncoderAttention(
            num_heads=self.num_heads_per_partition,
            head_size=self.head_dim,
            scale=self.head_dim**-0.5,
            prefix=f"{prefix}.attn",
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.chunk(3, dim=-1)
        out = self.attn(q, k, v)
        out, _ = self.out_proj(out)
        return out

Moondream3VisionBlock

Bases: Module

Transformer block for vision encoder.

Source code in vllm/model_executor/models/moondream3.py
class Moondream3VisionBlock(nn.Module):
    """Transformer block for vision encoder."""

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        num_heads: int,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.ln1 = nn.LayerNorm(hidden_size, eps=1e-5)
        self.attn = Moondream3VisionAttention(
            hidden_size=hidden_size,
            num_heads=num_heads,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
        )
        self.ln2 = nn.LayerNorm(hidden_size, eps=1e-5)
        self.mlp = Moondream3VisionMLP(
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            quant_config=quant_config,
            prefix=f"{prefix}.mlp",
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

Moondream3VisionEncoder

Bases: Module

Vision encoder (SigLIP-style ViT).

Source code in vllm/model_executor/models/moondream3.py
class Moondream3VisionEncoder(nn.Module):
    """Vision encoder (SigLIP-style ViT)."""

    def __init__(
        self,
        config: Moondream3VisionConfig,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.config = config

        # Patch embedding
        self.patch_emb = nn.Linear(
            config.enc_patch_size * config.enc_patch_size * 3,
            config.enc_dim,
            bias=True,
        )

        # Position embeddings (27x27 = 729 patches for 378x378 / 14)
        num_patches = (config.crop_size // config.enc_patch_size) ** 2
        self.pos_emb = nn.Parameter(torch.zeros(1, num_patches, config.enc_dim))

        # Transformer blocks
        self.blocks = nn.ModuleList(
            [
                Moondream3VisionBlock(
                    hidden_size=config.enc_dim,
                    intermediate_size=config.enc_ff_dim,
                    num_heads=config.enc_n_heads,
                    quant_config=quant_config,
                    prefix=f"{prefix}.blocks.{i}",
                )
                for i in range(config.enc_n_layers)
            ]
        )

        self.post_ln = nn.LayerNorm(config.enc_dim, eps=1e-5)

    def create_patches(self, images: torch.Tensor) -> torch.Tensor:
        """Convert images to patch embeddings.

        Args:
            images: (batch, channels, height, width)

        Returns:
            patches: (batch, num_patches, patch_dim)
        """
        patch_size = self.config.enc_patch_size
        batch, channels, height, width = images.shape
        patches_h = height // patch_size
        patches_w = width // patch_size

        # Unfold into patches
        patches = images.unfold(2, patch_size, patch_size).unfold(
            3, patch_size, patch_size
        )
        # (batch, channels, patches_h, patches_w, patch_size, patch_size)
        patches = patches.permute(0, 2, 3, 1, 4, 5).contiguous()
        # (batch, patches_h, patches_w, channels, patch_size, patch_size)
        patches = patches.view(batch, patches_h * patches_w, -1)
        # (batch, num_patches, channels * patch_size * patch_size)

        return patches

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Encode images.

        Args:
            pixel_values: (batch, channels, height, width)

        Returns:
            features: (batch, num_patches, hidden_size)
        """
        # Create patches and embed
        patches = self.create_patches(pixel_values)
        x = self.patch_emb(patches)

        # Add position embeddings
        x = x + self.pos_emb

        # Apply transformer blocks
        for block in self.blocks:
            x = block(x)

        # Final layer norm
        x = self.post_ln(x)

        return x
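
To make the shapes concrete: with a 378x378 crop and 14-pixel patches (the case noted in the position-embedding comment above), the encoder sees 27x27 = 729 patches of 14*14*3 = 588 raw values each, embeds them to enc_dim, and keeps that (batch, 729, enc_dim) shape through every block and the final LayerNorm. A small arithmetic check, with enc_dim = 1152 used only as an illustrative value:

# Worked shape arithmetic for the 378x378 / patch-size-14 case.
crop_size, patch_size, enc_dim = 378, 14, 1152    # enc_dim is illustrative
patches_per_side = crop_size // patch_size        # 27
num_patches = patches_per_side ** 2               # 729
patch_dim = 3 * patch_size * patch_size           # 588

# create_patches: (B, 3, 378, 378) -> (B, 729, 588)
# patch_emb:      (B, 729, 588)    -> (B, 729, 1152)
# blocks / post_ln preserve (B, 729, 1152)
print(num_patches, patch_dim)  # 729 588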

create_patches

create_patches(images: Tensor) -> Tensor

Convert images to patch embeddings.

Parameters:

    images (Tensor, required): (batch, channels, height, width)

Returns:

    patches (Tensor): (batch, num_patches, patch_dim)

Source code in vllm/model_executor/models/moondream3.py
def create_patches(self, images: torch.Tensor) -> torch.Tensor:
    """Convert images to patch embeddings.

    Args:
        images: (batch, channels, height, width)

    Returns:
        patches: (batch, num_patches, patch_dim)
    """
    patch_size = self.config.enc_patch_size
    batch, channels, height, width = images.shape
    patches_h = height // patch_size
    patches_w = width // patch_size

    # Unfold into patches
    patches = images.unfold(2, patch_size, patch_size).unfold(
        3, patch_size, patch_size
    )
    # (batch, channels, patches_h, patches_w, patch_size, patch_size)
    patches = patches.permute(0, 2, 3, 1, 4, 5).contiguous()
    # (batch, patches_h, patches_w, channels, patch_size, patch_size)
    patches = patches.view(batch, patches_h * patches_w, -1)
    # (batch, num_patches, channels * patch_size * patch_size)

    return patches
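
A minimal runnable check of the unfold-based patchification, using a toy 2-channel 4x4 image with patch_size=2 (sizes are illustrative only): each output row should be one patch, flattened channel-first.

import torch

patch_size = 2
images = torch.arange(2 * 2 * 4 * 4, dtype=torch.float32).reshape(2, 2, 4, 4)

patches = images.unfold(2, patch_size, patch_size).unfold(3, patch_size, patch_size)
patches = patches.permute(0, 2, 3, 1, 4, 5).contiguous()
patches = patches.view(2, -1, 2 * patch_size * patch_size)

# Patch 0 of image 0 equals the top-left 2x2 block of every channel, flattened.
expected = images[0, :, :2, :2].reshape(-1)
assert torch.equal(patches[0, 0], expected)
print(patches.shape)  # torch.Size([2, 4, 8])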

forward

forward(pixel_values: Tensor) -> Tensor

Encode images.

Parameters:

    pixel_values (Tensor, required): (batch, channels, height, width)

Returns:

    features (Tensor): (batch, num_patches, hidden_size)

Source code in vllm/model_executor/models/moondream3.py
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
    """Encode images.

    Args:
        pixel_values: (batch, channels, height, width)

    Returns:
        features: (batch, num_patches, hidden_size)
    """
    # Create patches and embed
    patches = self.create_patches(pixel_values)
    x = self.patch_emb(patches)

    # Add position embeddings
    x = x + self.pos_emb

    # Apply transformer blocks
    for block in self.blocks:
        x = block(x)

    # Final layer norm
    x = self.post_ln(x)

    return x

Moondream3VisionMLP

Bases: Module

MLP for vision encoder blocks.

Source code in vllm/model_executor/models/moondream3.py
class Moondream3VisionMLP(nn.Module):
    """MLP for vision encoder blocks."""

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.fc1 = ColumnParallelLinear(
            hidden_size,
            intermediate_size,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.fc1",
        )
        self.act = get_act_fn("gelu_pytorch_tanh")
        self.fc2 = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.fc2",
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x, _ = self.fc1(x)
        x = self.act(x)
        x, _ = self.fc2(x)
        return x
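
Functionally this is a two-layer MLP with a tanh-approximate GELU in between; the ColumnParallelLinear/RowParallelLinear pair only shards the intermediate dimension across tensor-parallel ranks. A single-device sketch, with 1152 and 4304 as illustrative sizes rather than config values:

import torch
import torch.nn as nn

# Single-device sketch of the same fc1 -> GELU(tanh) -> fc2 computation.
mlp = nn.Sequential(
    nn.Linear(1152, 4304, bias=True),
    nn.GELU(approximate="tanh"),
    nn.Linear(4304, 1152, bias=True),
)
print(mlp(torch.randn(1, 729, 1152)).shape)  # torch.Size([1, 729, 1152])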

Moondream3VisionProjection

Bases: Module

Projects vision features to text embedding dimension.

Source code in vllm/model_executor/models/moondream3.py
class Moondream3VisionProjection(nn.Module):
    """Projects vision features to text embedding dimension."""

    def __init__(
        self,
        input_dim: int,
        inner_dim: int,
        output_dim: int,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        # Input is concatenated global and local features (2 * input_dim)
        self.fc1 = ColumnParallelLinear(
            input_dim * 2,
            inner_dim,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.fc1",
        )
        self.act = get_act_fn("gelu_pytorch_tanh")
        self.fc2 = RowParallelLinear(
            inner_dim,
            output_dim,
            bias=True,
            quant_config=quant_config,
            prefix=f"{prefix}.fc2",
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x, _ = self.fc1(x)
        x = self.act(x)
        x, _ = self.fc2(x)
        return x
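
As the fc1 comment notes, the projection consumes global and local vision features concatenated along the channel dimension, so fc1's input width is 2 * input_dim. A single-device sketch with illustrative sizes (input_dim, inner_dim, and output_dim here are assumptions, not config values):

import torch
import torch.nn as nn

input_dim, inner_dim, output_dim, num_patches = 1152, 8192, 2048, 729
global_feats = torch.randn(1, num_patches, input_dim)
local_feats = torch.randn(1, num_patches, input_dim)
x = torch.cat([global_feats, local_feats], dim=-1)   # (1, 729, 2 * input_dim)

proj = nn.Sequential(
    nn.Linear(2 * input_dim, inner_dim, bias=True),
    nn.GELU(approximate="tanh"),
    nn.Linear(inner_dim, output_dim, bias=True),
)
print(proj(x).shape)  # torch.Size([1, 729, 2048])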

reconstruct_from_crops

reconstruct_from_crops(
    crops: Tensor,
    tiling: tuple[int, int],
    overlap_margin: int,
    patch_size: int = 14,
) -> Tensor

Reconstruct features from overlapping crops.

Source code in vllm/model_executor/models/moondream3.py
def reconstruct_from_crops(
    crops: torch.Tensor,
    tiling: tuple[int, int],
    overlap_margin: int,
    patch_size: int = 14,
) -> torch.Tensor:
    """Reconstruct features from overlapping crops."""
    tiling_h, tiling_w = tiling
    crop_height, crop_width = crops[0].shape[:2]
    margin_pixels = overlap_margin * patch_size

    output_h = (crop_height - 2 * margin_pixels) * tiling_h + 2 * margin_pixels
    output_w = (crop_width - 2 * margin_pixels) * tiling_w + 2 * margin_pixels

    reconstructed = torch.zeros(
        (output_h, output_w, crops[0].shape[2]),
        device=crops[0].device,
        dtype=crops[0].dtype,
    )

    for i, crop in enumerate(crops):
        tile_y = i // tiling_w
        tile_x = i % tiling_w

        x_start = 0 if tile_x == 0 else margin_pixels
        x_end = crop_width if tile_x == tiling_w - 1 else crop_width - margin_pixels
        y_start = 0 if tile_y == 0 else margin_pixels
        y_end = crop_height if tile_y == tiling_h - 1 else crop_height - margin_pixels

        out_x = tile_x * (crop_width - 2 * margin_pixels)
        out_y = tile_y * (crop_height - 2 * margin_pixels)

        reconstructed[
            out_y + y_start : out_y + y_end, out_x + x_start : out_x + x_end
        ] = crop[y_start:y_end, x_start:x_end]

    return reconstructed
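
A usage sketch of the output-size arithmetic, assuming reconstruct_from_crops is imported from vllm.model_executor.models.moondream3. The crops are channels-last, and every interior edge of a crop is trimmed by overlap_margin * patch_size pixels before the tiles are stitched; the 2x2 tiling, 378x378 crop size, and 4-patch margin below are illustrative assumptions.

import torch

tiling = (2, 2)
overlap_margin, patch_size = 4, 14           # margin of 4 * 14 = 56 pixels
crops = torch.randn(4, 378, 378, 3)          # (num_crops, crop_h, crop_w, channels)

out = reconstruct_from_crops(crops, tiling, overlap_margin, patch_size=patch_size)
# Each crop contributes (378 - 2*56) = 266 interior pixels per axis, plus the
# two outer margins: 266 * 2 + 2 * 56 = 644.
print(out.shape)  # torch.Size([644, 644, 3])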