vllm.v1.worker.gpu.mm.encoder_cache

EncoderCache

Source code in vllm/v1/worker/gpu/mm/encoder_cache.py

class EncoderCache:
    """Bookkeeping for multi-modal inputs and their encoder outputs.

    Two independent mappings are maintained:

    * ``mm_features``: request ID -> the multi-modal feature specs
      registered for that request.
    * ``encoder_outputs``: multi-modal hash -> the encoder output tensor
      cached for that input.

    Removing a request does not touch ``encoder_outputs``; cached outputs
    are dropped only by ``free_encoder_cache`` or ``reset_encoder_cache``.
    """

    def __init__(self):
        # MM hash -> encoder outputs
        self.encoder_outputs: dict[str, torch.Tensor] = {}
        # req_id -> MM features
        self.mm_features: dict[str, list[MultiModalFeatureSpec]] = {}

    def add_request(
        self, req_id: str, mm_features: list[MultiModalFeatureSpec]
    ) -> None:
        """Register the multi-modal features of a request.

        The list is stored as given (no copy). Registering an already
        known ``req_id`` replaces its previous entry.
        """
        self.mm_features[req_id] = mm_features

    def remove_request(self, req_id: str) -> None:
        """Forget the features registered for ``req_id``.

        Unknown request IDs are ignored.
        """
        if req_id in self.mm_features:
            del self.mm_features[req_id]

    def reset_mm_cache(self) -> None:
        """Placeholder for clearing the profiling-time multi-modal cache.

        The intent is to release the multi-modal cache that was populated
        during profiling and is not needed for inference. Nothing is
        cleared yet; the call currently returns without doing any work.
        """
        # TODO: Implement MM budget for encoder dummy run
        return

    def reset_encoder_cache(self) -> None:
        """Drop every cached encoder output held on the GPU side.

        Call this after the model weights change: the cached vision
        embeddings were computed with the previous weights and must not
        be served again.
        """
        # Empty the dict in place so existing references to it stay valid.
        self.encoder_outputs.clear()

    def free_encoder_cache(self, mm_hash: str) -> None:
        """Drop the cached encoder output for ``mm_hash``.

        Hashes that are not cached are ignored.
        """
        if mm_hash in self.encoder_outputs:
            del self.encoder_outputs[mm_hash]

reset_encoder_cache

reset_encoder_cache() -> None

Clear the GPU-side encoder cache storing vision embeddings.

This should be called when model weights are updated to ensure stale embeddings computed with old weights are not reused.

Source code in vllm/v1/worker/gpu/mm/encoder_cache.py

def reset_encoder_cache(self) -> None:
    """Drop every cached encoder output held on the GPU side.

    Call this after the model weights change: the cached vision
    embeddings were computed with the previous weights and must not
    be served again.
    """
    # Empty the dict in place so existing references to it stay valid.
    self.encoder_outputs.clear()

reset_mm_cache

reset_mm_cache() -> None

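Clear the multi-modal cache that was used during profiling but is no longer needed during inference. Note: as the source below shows, this method is currently a no-op placeholder pending the TODO on the encoder dummy-run MM budget.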

Source code in vllm/v1/worker/gpu/mm/encoder_cache.py

def reset_mm_cache(self) -> None:
    """Placeholder for clearing the profiling-time multi-modal cache.

    The intent is to release the multi-modal cache that was populated
    during profiling and is not needed for inference. Nothing is
    cleared yet; the call currently returns without doing any work.
    """
    # TODO: Implement MM budget for encoder dummy run
    return