Skip to content

vllm.config.offload

Configuration for model weight offloading.

OffloadConfig

Configuration for model weight offloading to reduce GPU memory usage.

Source code in vllm/config/offload.py
@config
class OffloadConfig:
    """Configuration for model weight offloading to reduce GPU memory usage.

    Bundles the backend selector with one sub-config per backend
    (`uva` and `prefetch`). Cross-field consistency between the selector
    and the sub-configs is checked in `validate_offload_config`.
    """

    offload_backend: OffloadBackend = "auto"
    """The backend for weight offloading. Options:
    - "auto": Selects based on which sub-config has non-default values
      (prefetch if offload_group_size > 0, uva if cpu_offload_gb > 0).
    - "uva": UVA (Unified Virtual Addressing) zero-copy offloading.
    - "prefetch": Async prefetch with group-based layer offloading.
    """

    uva: UVAOffloadConfig = Field(default_factory=UVAOffloadConfig)
    """Parameters for UVA offloading backend."""

    prefetch: PrefetchOffloadConfig = Field(default_factory=PrefetchOffloadConfig)
    """Parameters for prefetch offloading backend."""

    @model_validator(mode="after")
    def validate_offload_config(self) -> "OffloadConfig":
        """Check cross-field constraints and warn about ignored settings.

        Returns:
            This instance, unchanged.

        Raises:
            ValueError: If prefetch offloading is requested (explicit
                "prefetch" backend or `offload_group_size > 0`) and either
                `offload_num_in_group > offload_group_size` or
                `offload_prefetch_step < 1`.
        """
        backend = self.offload_backend
        prefetch_cfg = self.prefetch
        group_size = prefetch_cfg.offload_group_size
        prefetch_active = group_size > 0

        # Prefetch constraints apply both when the backend is forced to
        # "prefetch" and when the prefetch fields have been set.
        if backend == "prefetch" or prefetch_active:
            num_in_group = prefetch_cfg.offload_num_in_group
            if num_in_group > group_size:
                raise ValueError(
                    f"offload_num_in_group ({num_in_group})"
                    f" must be <= offload_group_size"
                    f" ({group_size})"
                )
            prefetch_step = prefetch_cfg.offload_prefetch_step
            if prefetch_step < 1:
                raise ValueError(
                    f"offload_prefetch_step"
                    f" ({prefetch_step})"
                    f" must be >= 1 when prefetch offloading is enabled"
                    f" (offload_group_size > 0)"
                )

        uva_active = self.uva.cpu_offload_gb > 0

        # Pick at most one warning: settings for a backend that will not be
        # used, or an ambiguous "auto" selection when both are configured.
        message = None
        if backend == "uva":
            if prefetch_active:
                message = (
                    "Prefetch offload fields are set but offload_backend='uva'. "
                    "Prefetch settings will be ignored."
                )
        elif backend == "prefetch":
            if uva_active:
                message = (
                    "UVA offload fields are set but offload_backend='prefetch'. "
                    "UVA settings will be ignored."
                )
        elif backend == "auto" and uva_active and prefetch_active:
            message = (
                "Both UVA and prefetch offload fields are set with "
                "offload_backend='auto'. Prefetch backend will be selected. "
                "Set offload_backend explicitly to suppress this warning."
            )

        if message is not None:
            warnings.warn(message, stacklevel=2)
        return self

    def compute_hash(self) -> str:
        """
        Return a hash string covering every offload setting.

        No field is excluded: PrefetchOffloader patches module forwards
        and inserts custom ops (wait_prefetch, start_prefetch) into the
        computation graph, so changing any offload setting can alter
        which layers are hooked and how prefetch indices are computed.
        The compilation cache must therefore tell such configs apart.
        """
        from vllm.config.utils import get_hash_factors, hash_factors

        # An empty ignore-set means all fields contribute to the hash.
        return hash_factors(get_hash_factors(self, ignored_factors=set()))

offload_backend class-attribute instance-attribute

offload_backend: OffloadBackend = 'auto'

The backend for weight offloading. Options:

- "auto": Selects based on which sub-config has non-default values (prefetch if offload_group_size > 0, uva if cpu_offload_gb > 0).
- "uva": UVA (Unified Virtual Addressing) zero-copy offloading.
- "prefetch": Async prefetch with group-based layer offloading.

prefetch class-attribute instance-attribute

prefetch: PrefetchOffloadConfig = Field(
    default_factory=PrefetchOffloadConfig
)

Parameters for prefetch offloading backend.

uva class-attribute instance-attribute

uva: UVAOffloadConfig = Field(
    default_factory=UVAOffloadConfig
)

Parameters for UVA offloading backend.

compute_hash

compute_hash() -> str

Provide a hash that uniquely identifies all the offload configs.

All fields are included because PrefetchOffloader patches module forwards and inserts custom ops (wait_prefetch, start_prefetch) into the computation graph. Changing any offload setting can alter which layers are hooked and how prefetch indices are computed, so the compilation cache must distinguish them.

Source code in vllm/config/offload.py
def compute_hash(self) -> str:
    """
    Return a hash string covering every offload setting.

    No field is excluded: PrefetchOffloader patches module forwards
    and inserts custom ops (wait_prefetch, start_prefetch) into the
    computation graph, so changing any offload setting can alter
    which layers are hooked and how prefetch indices are computed.
    The compilation cache must therefore tell such configs apart.
    """
    from vllm.config.utils import get_hash_factors, hash_factors

    # An empty ignore-set means all fields contribute to the hash.
    return hash_factors(get_hash_factors(self, ignored_factors=set()))

validate_offload_config

validate_offload_config() -> OffloadConfig

Validate offload configuration constraints.

Source code in vllm/config/offload.py
@model_validator(mode="after")
def validate_offload_config(self) -> "OffloadConfig":
    """Check cross-field constraints and warn about ignored settings.

    Returns:
        This instance, unchanged.

    Raises:
        ValueError: If prefetch offloading is requested (explicit
            "prefetch" backend or `offload_group_size > 0`) and either
            `offload_num_in_group > offload_group_size` or
            `offload_prefetch_step < 1`.
    """
    backend = self.offload_backend
    prefetch_cfg = self.prefetch
    group_size = prefetch_cfg.offload_group_size
    prefetch_active = group_size > 0

    # Prefetch constraints apply both when the backend is forced to
    # "prefetch" and when the prefetch fields have been set.
    if backend == "prefetch" or prefetch_active:
        num_in_group = prefetch_cfg.offload_num_in_group
        if num_in_group > group_size:
            raise ValueError(
                f"offload_num_in_group ({num_in_group})"
                f" must be <= offload_group_size"
                f" ({group_size})"
            )
        prefetch_step = prefetch_cfg.offload_prefetch_step
        if prefetch_step < 1:
            raise ValueError(
                f"offload_prefetch_step"
                f" ({prefetch_step})"
                f" must be >= 1 when prefetch offloading is enabled"
                f" (offload_group_size > 0)"
            )

    uva_active = self.uva.cpu_offload_gb > 0

    # Pick at most one warning: settings for a backend that will not be
    # used, or an ambiguous "auto" selection when both are configured.
    message = None
    if backend == "uva":
        if prefetch_active:
            message = (
                "Prefetch offload fields are set but offload_backend='uva'. "
                "Prefetch settings will be ignored."
            )
    elif backend == "prefetch":
        if uva_active:
            message = (
                "UVA offload fields are set but offload_backend='prefetch'. "
                "UVA settings will be ignored."
            )
    elif backend == "auto" and uva_active and prefetch_active:
        message = (
            "Both UVA and prefetch offload fields are set with "
            "offload_backend='auto'. Prefetch backend will be selected. "
            "Set offload_backend explicitly to suppress this warning."
        )

    if message is not None:
        warnings.warn(message, stacklevel=2)
    return self

PrefetchOffloadConfig

Configuration for prefetch-based CPU offloading.

Groups layers and uses async H2D prefetch to hide transfer latency.

Source code in vllm/config/offload.py
@config
class PrefetchOffloadConfig:
    """Configuration for prefetch-based CPU offloading.

    Groups layers and uses async H2D prefetch to hide transfer latency.

    Only per-field bounds are declared here. The cross-field rules
    (`offload_num_in_group <= offload_group_size` and
    `offload_prefetch_step >= 1`) are checked by
    `OffloadConfig.validate_offload_config` when the "prefetch" backend is
    selected or `offload_group_size > 0`.
    """

    # 0 keeps prefetch offloading disabled; OffloadConfig treats any value > 0
    # as "prefetch fields are set".
    offload_group_size: int = Field(default=0, ge=0)
    """Group every N layers together. Offload last `offload_num_in_group`
    layers of each group. Default is 0 (disabled).
    Example: group_size=8, num_in_group=2 offloads layers 6,7,14,15,22,23,...
    Unlike cpu_offload_gb, this uses explicit async prefetching to hide transfer
    latency.
    """

    # The upper bound (<= offload_group_size) depends on another field, so it
    # is enforced in OffloadConfig.validate_offload_config, not by Field().
    offload_num_in_group: int = Field(default=1, ge=1)
    """Number of layers to offload per group.
    Must be <= offload_group_size. Default is 1."""

    # Field() allows 0, but OffloadConfig.validate_offload_config rejects
    # values < 1 once prefetch offloading is enabled.
    offload_prefetch_step: int = Field(default=1, ge=0)
    """Number of layers to prefetch ahead.
    Higher values hide more latency but use more GPU memory. Default is 1."""

    # Empty by default, which selects all parameters of each offloaded layer.
    offload_params: set[str] = Field(default_factory=set)
    """The set of parameter name segments to target for prefetch offloading.
    Unmatched parameters are not offloaded. If this set is empty, ALL
    parameters of each offloaded layer are offloaded.
    Uses segment matching: "w13_weight" matches "mlp.experts.w13_weight"
    but not "mlp.experts.w13_weight_scale".
    """

offload_group_size class-attribute instance-attribute

offload_group_size: int = Field(default=0, ge=0)

Group every N layers together and offload the last offload_num_in_group layers of each group. Default is 0 (disabled). Example: group_size=8, num_in_group=2 offloads layers 6, 7, 14, 15, 22, 23, ... Unlike cpu_offload_gb, this uses explicit async prefetching to hide transfer latency.

offload_num_in_group class-attribute instance-attribute

offload_num_in_group: int = Field(default=1, ge=1)

Number of layers to offload per group. Must be <= offload_group_size. Default is 1.

offload_params class-attribute instance-attribute

offload_params: set[str] = Field(default_factory=set)

The set of parameter name segments to target for prefetch offloading. Unmatched parameters are not offloaded. If this set is empty, ALL parameters of each offloaded layer are offloaded. Uses segment matching: "w13_weight" matches "mlp.experts.w13_weight" but not "mlp.experts.w13_weight_scale".

offload_prefetch_step class-attribute instance-attribute

offload_prefetch_step: int = Field(default=1, ge=0)

Number of layers to prefetch ahead. Higher values hide more latency but use more GPU memory. Default is 1.

UVAOffloadConfig

Configuration for UVA (Unified Virtual Addressing) CPU offloading.

Uses zero-copy access from CPU-pinned memory. Simple but requires fast CPU-GPU interconnect.

Source code in vllm/config/offload.py
@config
class UVAOffloadConfig:
    """Configuration for UVA (Unified Virtual Addressing) CPU offloading.

    Uses zero-copy access from CPU-pinned memory. Simple but requires
    fast CPU-GPU interconnect.

    Both fields default to "off" (0 GiB, empty parameter set).
    """

    # Non-negative; OffloadConfig treats cpu_offload_gb > 0 as "UVA fields are
    # set" when validating the chosen backend.
    cpu_offload_gb: float = Field(default=0, ge=0)
    """The space in GiB to offload to CPU, per GPU. Default is 0, which means
    no offloading. Intuitively, this argument can be seen as a virtual way to
    increase the GPU memory size. For example, if you have one 24 GB GPU and
    set this to 10, virtually you can think of it as a 34 GB GPU. Then you can
    load a 13B model with BF16 weight, which requires at least 26GB GPU memory.
    Note that this requires fast CPU-GPU interconnect, as part of the model is
    loaded from CPU memory to GPU memory on the fly in each model forward pass.
    This uses UVA (Unified Virtual Addressing) for zero-copy access.
    """

    # Empty by default, which means no name-based filtering is applied.
    cpu_offload_params: set[str] = Field(default_factory=set)
    """The set of parameter name segments to target for CPU offloading.
    Unmatched parameters are not offloaded. If this set is empty, parameters
    are offloaded non-selectively until the memory limit defined by
    `cpu_offload_gb` is reached.
    Examples:
        - For parameter name "mlp.experts.w2_weight":
            - "experts" or "experts.w2_weight" will match.
            - "expert" or "w2" will NOT match (must be exact segments).
    This allows distinguishing parameters like "w2_weight" and "w2_weight_scale".
    """

cpu_offload_gb class-attribute instance-attribute

cpu_offload_gb: float = Field(default=0, ge=0)

The space in GiB to offload to CPU, per GPU. Default is 0, which means no offloading. Intuitively, this argument can be seen as a virtual way to increase the GPU memory size. For example, if you have one 24 GB GPU and set this to 10, virtually you can think of it as a 34 GB GPU. Then you can load a 13B model with BF16 weight, which requires at least 26GB GPU memory. Note that this requires fast CPU-GPU interconnect, as part of the model is loaded from CPU memory to GPU memory on the fly in each model forward pass. This uses UVA (Unified Virtual Addressing) for zero-copy access.

cpu_offload_params class-attribute instance-attribute

cpu_offload_params: set[str] = Field(default_factory=set)

The set of parameter name segments to target for CPU offloading. Unmatched parameters are not offloaded. If this set is empty, parameters are offloaded non-selectively until the memory limit defined by cpu_offload_gb is reached.

Examples, for parameter name "mlp.experts.w2_weight":

- "experts" or "experts.w2_weight" will match.
- "expert" or "w2" will NOT match (must be exact segments).

This allows distinguishing parameters like "w2_weight" and "w2_weight_scale".