class Moondream3Processor(ProcessorMixin):
"""
Constructs a Moondream3 processor which handles image preprocessing
and tokenization for the Moondream3 multimodal model.
Args:
tokenizer: The tokenizer to use for text processing.
chat_template: Optional chat template string.
crop_size: Size of each image crop.
max_crops: Maximum number of crops per image.
overlap_margin: Margin for overlapping crops in patches.
patch_size: Size of each patch.
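
    Example (a minimal usage sketch; the checkpoint id is an assumption):

        processor = Moondream3Processor.from_pretrained(
            "moondream/moondream3-preview"
        )
        messages = [{"role": "user", "content": "describe long"}]
        prompt = processor.apply_chat_template(messages)
        # -> '<|endoftext|><image><|md_reserved_0|>describe'
        #    '<|md_reserved_1|>long<|md_reserved_2|>'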
"""
attributes = ["tokenizer"]
valid_kwargs = [
"chat_template",
"crop_size",
"max_crops",
"overlap_margin",
"patch_size",
]
tokenizer_class = "AutoTokenizer"
    # Fallback tokenizer repo, used when neither the caller nor the model
    # repo provides a tokenizer (see `from_pretrained` below).
    _tokenizer_repo = "moondream/starmie-v1"
# Default chat template for Moondream3
# Moondream uses special tokens for prompting:
# - Token 0 (<|endoftext|>): BOS token (ALWAYS present at position 0)
# - Token 1 (<|md_reserved_0|>): Start of instruction
# - Token 2 (<|md_reserved_1|>): Separator before question
# - Token 3 (<|md_reserved_2|>): End of question / start of answer
#
    # Task routing based on text prefix:
    #   "caption" / "describe" (bare)   → describe<|md_reserved_1|>normal
    #   "caption {short|normal|long}"   → describe<|md_reserved_1|>{length}
    #   "describe {short|normal|long}"  → describe<|md_reserved_1|>{length}
    #   otherwise                       → query<|md_reserved_1|>{text}
#
# Format with image:
# <|endoftext|><image><|md_reserved_0|>{task}<|md_reserved_1|>{q}<|md_reserved_2|>
# Format without image:
# <|endoftext|><|md_reserved_0|>{task}<|md_reserved_1|>{q}<|md_reserved_2|>
_default_chat_template = (
"{% for message in messages %}"
"{% if message['role'] == 'user' %}"
"{% if message['content'] is string %}"
# Simple string content (with image assumed) - route by prefix
"<|endoftext|><image><|md_reserved_0|>"
"{% if message['content'] == 'caption' %}"
"describe<|md_reserved_1|>normal<|md_reserved_2|>"
"{% elif message['content'].startswith('caption ') %}"
"describe<|md_reserved_1|>{{ message['content'][8:] }}<|md_reserved_2|>"
"{% elif message['content'] == 'describe' %}"
"describe<|md_reserved_1|>normal<|md_reserved_2|>"
"{% elif message['content'].startswith('describe ') %}"
"describe<|md_reserved_1|>{{ message['content'][9:] }}<|md_reserved_2|>"
"{% else %}"
"query<|md_reserved_1|>{{ message['content'] }}<|md_reserved_2|>"
"{% endif %}"
"{% else %}"
# List content - build Moondream's image prefix independently of
# OpenAI-style content part order, then render the text task.
"<|endoftext|>"
"{% for content in message['content'] %}"
"{% if content['type'] in ['image', 'image_url', 'input_image', 'image_pil'] %}" # noqa: E501
"<image>"
"{% endif %}"
"{% endfor %}"
"{% for content in message['content'] %}"
"{% if content['type'] == 'text' %}"
"<|md_reserved_0|>"
"{% if content['text'] == 'caption' %}"
"describe<|md_reserved_1|>normal<|md_reserved_2|>"
"{% elif content['text'].startswith('caption ') %}"
"describe<|md_reserved_1|>{{ content['text'][8:] }}<|md_reserved_2|>"
"{% elif content['text'] == 'describe' %}"
"describe<|md_reserved_1|>normal<|md_reserved_2|>"
"{% elif content['text'].startswith('describe ') %}"
"describe<|md_reserved_1|>{{ content['text'][9:] }}<|md_reserved_2|>"
"{% else %}"
"query<|md_reserved_1|>{{ content['text'] }}<|md_reserved_2|>"
"{% endif %}"
"{% endif %}"
"{% endfor %}"
"{% endif %}"
"{% elif message['role'] == 'assistant' %}"
"{{ message['content'] }}"
"{% endif %}"
"{% endfor %}"
)
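    # Example rendering (illustrative): for
    #   messages = [{"role": "user", "content": "caption short"}]
    # the template above produces:
    #   <|endoftext|><image><|md_reserved_0|>describe<|md_reserved_1|>short<|md_reserved_2|>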
def __init__(
self,
tokenizer: PreTrainedTokenizerBase | None = None,
chat_template: str | None = None,
crop_size: int = 378,
max_crops: int = 12,
overlap_margin: int = 4,
patch_size: int = 14,
**kwargs,
):
self.image_token = "<image>"
self.crop_size = crop_size
self.max_crops = max_crops
self.overlap_margin = overlap_margin
self.patch_size = patch_size
# Number of patches per crop (27x27 = 729 for 378/14)
self.patches_per_crop = (crop_size // patch_size) ** 2
# Use default chat template if none provided
if chat_template is None:
chat_template = self._default_chat_template
super().__init__(tokenizer, chat_template=chat_template)
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path,
**kwargs,
):
"""
        Load the processor, resolving the tokenizer automatically.

        Tokenizer files in the model repo are preferred; when none are
        available, the dedicated 'moondream/starmie-v1' tokenizer repo is
        used as a fallback.
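
        Example (illustrative; the local path is a placeholder):

            proc = Moondream3Processor.from_pretrained(
                "/path/to/checkpoint", tokenizer="moondream/starmie-v1"
            )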
"""
from transformers import AutoTokenizer, PreTrainedTokenizerFast
from transformers.utils import cached_file
tokenizer = kwargs.pop("tokenizer", None)
tokenizer_kwargs = {
"trust_remote_code": kwargs.get("trust_remote_code", False),
}
for key in (
"cache_dir",
"force_download",
"local_files_only",
"revision",
"subfolder",
"token",
"use_fast",
):
if key in kwargs:
tokenizer_kwargs[key] = kwargs[key]
cached_file_kwargs = {
key: tokenizer_kwargs[key]
for key in (
"cache_dir",
"force_download",
"local_files_only",
"revision",
"subfolder",
"token",
)
if key in tokenizer_kwargs
}
def load_tokenizer(repo_or_path):
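            # Try AutoTokenizer first; if that fails (e.g. no tokenizer
            # config), fall back to loading the raw tokenizer.json as a
            # PreTrainedTokenizerFast.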
try:
return AutoTokenizer.from_pretrained(repo_or_path, **tokenizer_kwargs)
except Exception:
tokenizer_file = cached_file(
repo_or_path,
"tokenizer.json",
**cached_file_kwargs,
)
return PreTrainedTokenizerFast(
tokenizer_file=tokenizer_file,
clean_up_tokenization_spaces=False,
)
if isinstance(tokenizer, str):
tokenizer = load_tokenizer(tokenizer)
if tokenizer is None:
# Prefer model-local tokenizer files first. If unavailable, fall
# back to moondream's dedicated tokenizer repository.
try:
tokenizer = load_tokenizer(pretrained_model_name_or_path)
except Exception:
tokenizer = load_tokenizer(cls._tokenizer_repo)
# Configure special tokens for Moondream3
# BOS and EOS are both token 0 (<|endoftext|>), matching the native
# config (TokenizerConfig.bos_id=0, eos_id=0). This is standard for
# GPT-2 style models where <|endoftext|> signals both start and end.
# Token 1 (<|md_reserved_0|>) is a template delimiter, NOT the EOS.
tokenizer.bos_token = "<|endoftext|>"
tokenizer.bos_token_id = 0
tokenizer.eos_token = "<|endoftext|>"
tokenizer.eos_token_id = 0
# Extract processor-specific kwargs
crop_size = kwargs.pop("crop_size", 378)
max_crops = kwargs.pop("max_crops", 12)
overlap_margin = kwargs.pop("overlap_margin", 4)
patch_size = kwargs.pop("patch_size", 14)
chat_template = kwargs.pop("chat_template", None)
# Set default chat template on tokenizer if not already set
if chat_template is None:
chat_template = cls._default_chat_template
if tokenizer.chat_template is None:
tokenizer.chat_template = chat_template
return cls(
tokenizer=tokenizer,
chat_template=chat_template,
crop_size=crop_size,
max_crops=max_crops,
overlap_margin=overlap_margin,
patch_size=patch_size,
)
def __call__(
self,
images: ImageInput = None,
text: TextInput
| PreTokenizedInput
| list[TextInput]
| list[PreTokenizedInput] = None,
**kwargs: Unpack[Moondream3ProcessorKwargs],
) -> BatchFeature:
"""
        Process images and text for the Moondream3 model.

        Args:
            images: Input images (PIL Image, numpy array, or list thereof).
            text: Input text or list of texts.
            **kwargs: Additional processing arguments.

        Returns:
            BatchFeature with the tokenized text and, when images are given,
            ``pixel_values`` and ``tilings``.
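
        Example (illustrative; assumes a loaded ``processor`` and a PIL
        image ``pil_image``):

            prompt = processor.apply_chat_template(
                [{"role": "user", "content": "caption short"}]
            )
            inputs = processor(images=pil_image, text=prompt)
            # inputs: input_ids, attention_mask, pixel_values, tilings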
"""
output_kwargs = self._merge_kwargs(
Moondream3ProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
# Process images
image_features = {}
if images is not None:
processed_images = []
tilings = []
images_list = images if isinstance(images, list) else [images]
for image in images_list:
pixel_values, tiling = self.preprocess_image(
image, **output_kwargs["images_kwargs"]
)
processed_images.append(pixel_values)
tilings.append(tiling)
if processed_images:
image_features["pixel_values"] = processed_images
image_features["tilings"] = tilings
# Process text
if text is not None:
if not isinstance(text, list):
text = [text]
# Get text kwargs, remove keys we set ourselves
text_kwargs = output_kwargs.get("text_kwargs", {}).copy()
text_kwargs.pop("return_tensors", None)
text_kwargs.pop("add_special_tokens", None)
# Tokenize text
tokenized = self.tokenizer(
text,
add_special_tokens=True,
return_tensors="pt",
**text_kwargs,
)
output = BatchFeature(data=dict(tokenized))
# Add image features
if image_features:
output["pixel_values"] = image_features["pixel_values"]
output["tilings"] = image_features["tilings"]
return output
# If only images were provided
return BatchFeature(data=image_features)
@staticmethod
def _image_array_to_uint8(array: np.ndarray) -> np.ndarray:
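        """Convert an image array of any supported dtype to contiguous uint8.

        Floats are rescaled from [0, 1] to [0, 255] when their max is <= 1.0,
        then rounded and clipped; booleans map to {0, 255}.
        """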
if array.dtype == np.uint8:
return np.ascontiguousarray(array)
if array.dtype == np.bool_:
return np.ascontiguousarray(array.astype(np.uint8) * 255)
if np.issubdtype(array.dtype, np.floating):
array = np.nan_to_num(array, nan=0.0, posinf=255.0, neginf=0.0)
if array.size > 0 and array.max() <= 1.0:
array = array * 255.0
array = np.rint(array)
return np.ascontiguousarray(np.clip(array, 0, 255).astype(np.uint8))
@staticmethod
def _to_pil_image(image: ImageInput) -> Image.Image:
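        """Coerce a PIL image, numpy array, or torch tensor to a PIL image.

        Arrays may be HWC or CHW with 1, 3, or 4 channels; single-channel
        inputs are squeezed to 2D before conversion.
        """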
if isinstance(image, Image.Image):
return image
if isinstance(image, torch.Tensor):
tensor = image.detach().cpu()
if tensor.dtype == torch.bfloat16:
tensor = tensor.to(torch.float32)
image_array = tensor.numpy()
elif isinstance(image, np.ndarray):
image_array = image
else:
raise TypeError(
"Moondream3 images must be PIL images, numpy arrays, "
f"or torch tensors, got {type(image)!r}."
)
if image_array.ndim == 2:
image_array = Moondream3Processor._image_array_to_uint8(image_array)
return Image.fromarray(image_array)
if image_array.ndim != 3:
raise ValueError(
"Moondream3 image arrays must have 2 or 3 dimensions, "
f"got shape {image_array.shape}."
)
channel_dims = (1, 3, 4)
if image_array.shape[-1] not in channel_dims:
if image_array.shape[0] not in channel_dims:
raise ValueError(
"Moondream3 image arrays must be HWC or CHW with 1, 3, "
f"or 4 channels, got shape {image_array.shape}."
)
image_array = np.transpose(image_array, (1, 2, 0))
image_array = Moondream3Processor._image_array_to_uint8(image_array)
if image_array.shape[-1] == 1:
image_array = image_array[..., 0]
return Image.fromarray(image_array)
def preprocess_image(
self,
image: ImageInput,
max_crops: int = 12,
overlap_margin: int = 4,
crop_size: int = 378,
patch_size: int = 14,
convert_to_rgb: bool = True,
return_tensors: str = "pt",
) -> tuple[torch.Tensor, tuple[int, int]]:
"""
        Preprocess an image using an overlap-and-resize cropping strategy.

        Args:
            image: Input PIL image, numpy array, or torch tensor.
            max_crops: Maximum number of local crops.
            overlap_margin: Overlap between adjacent crops, in patches.
            crop_size: Side length, in pixels, of each square crop.
            patch_size: Side length, in pixels, of each vision patch.
            convert_to_rgb: Whether to convert the image to RGB.
            return_tensors: Return type ("pt" for PyTorch).

        Returns:
            Tuple of (pixel_values, tiling), where pixel_values has shape
            (n_crops, 3, crop_size, crop_size) and tiling is (rows, cols).
"""
image = self._to_pil_image(image)
if convert_to_rgb:
image = convert_image_mode(image, "RGB")
# Convert to numpy array
image_array = np.array(image)
original_h, original_w = image_array.shape[:2]
margin_pixels = patch_size * overlap_margin
total_margin_pixels = margin_pixels * 2
crop_patches = crop_size // patch_size
crop_window_patches = crop_patches - (2 * overlap_margin)
crop_window_size = crop_window_patches * patch_size
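        # With the defaults (crop_size=378, patch_size=14, overlap_margin=4):
        # margin_pixels = 56, crop_patches = 27, crop_window_patches = 19,
        # and crop_window_size = 266.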
tiling = select_tiling(
original_h - total_margin_pixels,
original_w - total_margin_pixels,
crop_window_size,
max_crops,
)
n_crops = tiling[0] * tiling[1] + 1
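        # Crop 0 holds the global (whole-image) crop; crops 1..n_crops-1 tile
        # the resized image in row-major order.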
crops = np.zeros((n_crops, crop_size, crop_size, 3), dtype=np.uint8)
target_size = (
tiling[0] * crop_window_size + total_margin_pixels,
tiling[1] * crop_window_size + total_margin_pixels,
)
# Resize image
pil_img = Image.fromarray(image_array)
resized = pil_img.resize(
(int(target_size[1]), int(target_size[0])),
resample=Image.Resampling.LANCZOS,
)
resized_array = np.asarray(resized)
# Create global crop
global_pil = pil_img.resize(
(crop_size, crop_size), resample=Image.Resampling.LANCZOS
)
crops[0] = np.asarray(global_pil)
# Create local crops
for i in range(tiling[0]):
for j in range(tiling[1]):
y0 = i * crop_window_size
x0 = j * crop_window_size
y_end = min(y0 + crop_size, resized_array.shape[0])
x_end = min(x0 + crop_size, resized_array.shape[1])
crop_region = resized_array[y0:y_end, x0:x_end]
crop_idx = 1 + i * tiling[1] + j
h_slice = slice(None, crop_region.shape[0])
w_slice = slice(None, crop_region.shape[1])
crops[crop_idx, h_slice, w_slice] = crop_region
# Convert to tensor: (n_crops, H, W, C) -> (n_crops, C, H, W)
pixel_values = np.transpose(crops, (0, 3, 1, 2))
if return_tensors == "pt":
# Match HF reference preprocessing exactly: convert uint8 crops to
# bfloat16 before in-place normalization.
pixel_values = (
torch.from_numpy(pixel_values)
.to(dtype=torch.bfloat16)
.div_(255.0)
.sub_(0.5)
.div_(0.5)
)
else:
pixel_values = pixel_values.astype(np.float32) / 255.0
pixel_values = (pixel_values - 0.5) / 0.5
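        # Either branch maps uint8 [0, 255] into the normalized range [-1, 1].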
return pixel_values, tiling
def get_num_image_tokens(self) -> int:
"""Return the number of image tokens (729 = 27x27 patches)."""
return self.patches_per_crop
def batch_decode(self, *args, **kwargs):
"""Forward to tokenizer's batch_decode."""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""Forward to tokenizer's decode."""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
return tokenizer_input_names + ["pixel_values", "tilings"]