vllm.lora.worker_manager

logger module-attribute

logger = init_logger(__name__)

LRUCacheWorkerLoRAManager

Bases: WorkerLoRAManager

A WorkerLoRAManager that manages LoRA models on the worker side.

Uses an LRU cache. On every request, the requested LoRAs are loaded (unless they are already loaded), and the least recently used LoRAs are unloaded if the cache is above capacity.
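
This cache is sized and exercised through the engine-level LoRA settings rather than called directly. A minimal, illustrative sketch (the model name and adapter path are placeholders; max_loras and max_cpu_loras are the public engine arguments assumed here to bound the GPU slots and this LRU cache, respectively):

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Placeholders: substitute a real base model and a real adapter checkpoint.
llm = LLM(
    model="my-base-model",
    enable_lora=True,
    max_loras=4,       # GPU LoRA slots available per batch
    max_cpu_loras=8,   # assumed capacity of the worker-side LRU cache
)

# Each distinct lora_int_id occupies one cache entry; requesting an id that
# is already cached simply refreshes its position in the LRU order.
outputs = llm.generate(
    "Hello",
    SamplingParams(max_tokens=16),
    lora_request=LoRARequest("adapter-a", 1, "/path/to/adapter-a"),
)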

Source code in vllm/lora/worker_manager.py
class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
    """WorkerLoRAManager that manages LoRA models on the worker side.

    Uses an LRU Cache. Every request, the requested LoRAs will be loaded
    (unless they are already loaded) and least recently used LoRAs will
    be unloaded if the cache is above capacity."""

    _manager_cls: type[LRUCacheLoRAModelManager] = LRUCacheLoRAModelManager

    def create_lora_manager(
        self,
        model: torch.nn.Module,
    ) -> Any:
        lora_manager = create_lora_manager(
            model,
            lora_manager_cls=self._manager_cls,
            max_num_seqs=self.max_num_seqs,
            vocab_size=self.vocab_size,
            lora_config=self.lora_config,
            device=self.device,
            max_num_batched_tokens=self.max_num_batched_tokens,
        )
        self._adapter_manager = lora_manager
        return lora_manager.model

    def _apply_adapters(self, lora_requests: set[LoRARequest]) -> None:
        loras_map = {
            lora_request.lora_int_id: lora_request
            for lora_request in lora_requests if lora_request
        }
        if len(loras_map) > self._adapter_manager.lora_slots:
            raise RuntimeError(
                f"Number of requested LoRAs ({len(loras_map)}) is greater "
                "than the number of GPU LoRA slots "
                f"({self._adapter_manager.lora_slots}).")
        for lora in loras_map.values():
            self.add_adapter(lora)

    def add_adapter(self, lora_request: LoRARequest) -> bool:
        # Note that this method is not thread-safe. It may be invoked multiple
        # times for the same adapter when using multiple API servers.
        # This is ok because it's currently only called from
        # the single-threaded core engine loop.

        if lora_request.lora_int_id not in self.list_adapters():
            # Load the new adapter first to ensure it is actually valid, before
            # evicting any existing adapters.
            # This may cause the # of loaded lora adapters to very temporarily
            # exceed `--max-cpu-loras`.
            lora = self._load_adapter(lora_request)

            # Loading succeeded, now check if we will exceed cache capacity and
            # evict the oldest adapter if so
            if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
                assert isinstance(self._adapter_manager,
                                  LRUCacheLoRAModelManager)
                self._adapter_manager.remove_oldest_adapter()
            # Then add the new adapter to the cache
            loaded = self._adapter_manager.add_adapter(lora)
        else:
            # If the lora is already loaded, just touch it to
            # update its position in the caches
            loaded = self._adapter_manager.get_adapter(
                lora_request.lora_int_id) is not None
        self._adapter_manager.activate_adapter(lora_request.lora_int_id)
        return loaded

_manager_cls class-attribute instance-attribute

_manager_cls: type[LRUCacheLoRAModelManager] = (
    LRUCacheLoRAModelManager
)

_apply_adapters

_apply_adapters(lora_requests: set[LoRARequest]) -> None
Source code in vllm/lora/worker_manager.py
def _apply_adapters(self, lora_requests: set[LoRARequest]) -> None:
    loras_map = {
        lora_request.lora_int_id: lora_request
        for lora_request in lora_requests if lora_request
    }
    if len(loras_map) > self._adapter_manager.lora_slots:
        raise RuntimeError(
            f"Number of requested LoRAs ({len(loras_map)}) is greater "
            "than the number of GPU LoRA slots "
            f"({self._adapter_manager.lora_slots}).")
    for lora in loras_map.values():
        self.add_adapter(lora)

add_adapter

add_adapter(lora_request: LoRARequest) -> bool
Source code in vllm/lora/worker_manager.py
def add_adapter(self, lora_request: LoRARequest) -> bool:
    # Note that this method is not thread-safe. It may be invoked multiple
    # times for the same adapter when using multiple API servers.
    # This is ok because it's currently only called from
    # the single-threaded core engine loop.

    if lora_request.lora_int_id not in self.list_adapters():
        # Load the new adapter first to ensure it is actually valid, before
        # evicting any existing adapters.
        # This may cause the # of loaded lora adapters to very temporarily
        # exceed `--max-cpu-loras`.
        lora = self._load_adapter(lora_request)

        # Loading succeeded, now check if we will exceed cache capacity and
        # evict the oldest adapter if so
        if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
            assert isinstance(self._adapter_manager,
                              LRUCacheLoRAModelManager)
            self._adapter_manager.remove_oldest_adapter()
        # Then add the new adapter to the cache
        loaded = self._adapter_manager.add_adapter(lora)
    else:
        # If the lora is already loaded, just touch it to
        # update its position in the caches
        loaded = self._adapter_manager.get_adapter(
            lora_request.lora_int_id) is not None
    self._adapter_manager.activate_adapter(lora_request.lora_int_id)
    return loaded
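
The comments above describe a load-first policy: the incoming adapter is fully loaded before anything is evicted, so a failed load never displaces a healthy cached adapter. The following is a self-contained toy sketch of that policy, not the vLLM implementation:

from collections import OrderedDict
from typing import Any, Callable

class _ToyLRUCache:
    """Toy illustration of the load-first, evict-on-overflow policy."""

    def __init__(self, capacity: int):
        self.capacity = capacity
        self._entries: OrderedDict[int, Any] = OrderedDict()

    def add(self, adapter_id: int, loader: Callable[[], Any]) -> bool:
        if adapter_id in self._entries:
            # Cache hit: just touch the entry to refresh its LRU position.
            self._entries.move_to_end(adapter_id)
            return True
        adapter = loader()  # may raise; nothing has been evicted yet
        if len(self._entries) >= self.capacity:
            self._entries.popitem(last=False)  # drop the least recently used
        self._entries[adapter_id] = adapter
        return True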

create_lora_manager

create_lora_manager(model: Module) -> Any
Source code in vllm/lora/worker_manager.py
def create_lora_manager(
    self,
    model: torch.nn.Module,
) -> Any:
    lora_manager = create_lora_manager(
        model,
        lora_manager_cls=self._manager_cls,
        max_num_seqs=self.max_num_seqs,
        vocab_size=self.vocab_size,
        lora_config=self.lora_config,
        device=self.device,
        max_num_batched_tokens=self.max_num_batched_tokens,
    )
    self._adapter_manager = lora_manager
    return lora_manager.model

WorkerLoRAManager

Bases: AbstractWorkerManager

A WorkerLoRAManager that manages LoRA models on the worker side.

On every request, the requested LoRAs are loaded (unless they are already loaded), and every other LoRA is unloaded.
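
In contrast to the LRU variant above, this manager keeps only the adapters named in the current request. A minimal sketch of those semantics (illustrative only; in the class below the actual work is delegated to apply_adapters_worker):

def _sync_adapters(loaded: set[int], requested: set[int],
                   remove_adapter, add_adapter) -> None:
    # Everything not named in this request is unloaded ...
    for adapter_id in loaded - requested:
        remove_adapter(adapter_id)
    # ... and anything requested but not yet loaded is loaded.
    for adapter_id in requested - loaded:
        add_adapter(adapter_id)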

Source code in vllm/lora/worker_manager.py
class WorkerLoRAManager(AbstractWorkerManager):
    """WorkerLoRAManager that manages LoRA models on the worker side.

    Every request, the requested LoRAs will be loaded (unless they are already
    loaded), and every other LoRA will be unloaded."""

    _manager_cls: type[LoRAModelManager] = LoRAModelManager

    def __init__(
        self,
        max_num_seqs: int,
        max_num_batched_tokens: int,
        vocab_size: int,
        lora_config: LoRAConfig,
        device: torch.device,
        embedding_modules: dict[str, str],
        embedding_padding_modules: list[str],
        lora_model_cls: type[LoRAModel] = LoRAModel,
        max_position_embeddings: Optional[int] = None,
    ):
        self._lora_model_cls = lora_model_cls
        self.embedding_modules = embedding_modules
        self.embedding_padding_modules = embedding_padding_modules
        self._cached_dummy_lora: Union[None, Literal[False], LoRAModel] = False
        self.max_num_seqs = max_num_seqs
        self.max_num_batched_tokens = max_num_batched_tokens
        self.vocab_size = vocab_size
        self.lora_config = lora_config
        self.max_position_embeddings = max_position_embeddings
        super().__init__(device)
        # Lazily initialized by create_lora_manager.
        self._adapter_manager: LoRAModelManager

    @contextmanager
    def dummy_lora_cache(self):
        """Use this context manager to reuse the dummy lora model
        to avoid creating it repeatedly."""
        self._cached_dummy_lora = None
        yield
        self._cached_dummy_lora = False

    @property
    def is_enabled(self) -> bool:
        return True

    def create_lora_manager(
        self,
        model: torch.nn.Module,
    ) -> Any:
        lora_manager = create_lora_manager(
            model,
            max_num_seqs=self.max_num_seqs,
            max_num_batched_tokens=self.max_num_batched_tokens,
            vocab_size=self.vocab_size,
            lora_config=self.lora_config,
            device=self.device,
            lora_manager_cls=self._manager_cls,
        )
        self._adapter_manager = lora_manager
        return lora_manager.model

    def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
        try:
            supported_lora_modules = (
                self._adapter_manager.supported_lora_modules)
            packed_modules_mapping = (
                self._adapter_manager.packed_modules_mapping)
            expected_lora_modules: list[str] = []
            for module in supported_lora_modules:
                if module in packed_modules_mapping:
                    expected_lora_modules.extend(
                        packed_modules_mapping[module])
                else:
                    expected_lora_modules.append(module)

            expected_lora_modules = list(set(expected_lora_modules))
            lora_path = get_adapter_absolute_path(lora_request.lora_path)

            peft_helper = PEFTHelper.from_local_dir(
                lora_path, self.max_position_embeddings,
                lora_request.tensorizer_config_dict)

            # Validates the LoRA configuration against requirements before
            # loading weights, throwing an exception if validation fails.
            peft_helper.validate_legal(self.lora_config)

            # For some models like Qwen2VL, we need to use hf_to_vllm_mapper
            # to ensure correct loading of lora weights.
            model = self._adapter_manager.model
            hf_to_vllm_mapper = getattr(model, "hf_to_vllm_mapper", None)

            lora = self._lora_model_cls.from_local_checkpoint(
                lora_path,
                expected_lora_modules,
                peft_helper=peft_helper,
                lora_model_id=lora_request.lora_int_id,
                device="cpu",
                dtype=self.lora_config.lora_dtype,
                target_embedding_padding=self.vocab_size +
                self.lora_config.lora_extra_vocab_size,
                embedding_modules=self.embedding_modules,
                embedding_padding_modules=self.embedding_padding_modules,
                tensorizer_config_dict=lora_request.tensorizer_config_dict,
                weights_mapper=hf_to_vllm_mapper)

        except FileNotFoundError as e:
            # FileNotFoundError should be raised if both
            # - No adapter found to download from huggingface (or in
            #       offline mode)
            # - No local adapter files found at `lora_request.lora_path`
            # For NotFoundError
            raise ValueError(
                f"Loading lora {lora_request.lora_name} failed: No adapter "
                f"found for {lora_request.lora_path}") from e
        except Exception as e:
            # For BadRequestError
            raise e

        if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size:
            raise ValueError(f"LoRA added vocab size {lora.extra_vocab_size} "
                             f"is greater than lora_extra_vocab_size "
                             f"{self.lora_config.lora_extra_vocab_size}.")
        return lora

    def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool:
        if lora_request.lora_int_id in self.list_adapters():
            return False
        if isinstance(self._cached_dummy_lora, LoRAModel):
            dummy_lora = self._cached_dummy_lora.clone(
                lora_request.lora_int_id)
        else:
            dummy_lora = self._adapter_manager.create_dummy_lora(
                lora_request.lora_int_id, rank, 1, self.embedding_modules)
            if self._cached_dummy_lora is None:
                self._cached_dummy_lora = dummy_lora
        return self._adapter_manager.add_adapter(dummy_lora)

    def pin_adapter(self, adapter_id: int) -> bool:
        return self._adapter_manager.pin_adapter(adapter_id)

    def set_active_adapters(self, requests: set[Any],
                            mapping: Optional[Any]) -> None:
        set_active_adapters_worker(requests, mapping, self._apply_adapters,
                                   self._adapter_manager.set_adapter_mapping)

    def _apply_adapters(self, adapter_requests: set[Any]) -> None:
        apply_adapters_worker(adapter_requests, self.list_adapters,
                              self._adapter_manager.adapter_slots,
                              self.remove_adapter, self.add_adapter)

    def add_adapter(self, adapter_request: Any) -> bool:
        return add_adapter_worker(adapter_request, self.list_adapters,
                                  self._load_adapter,
                                  self._adapter_manager.add_adapter,
                                  self._adapter_manager.activate_adapter)

    def remove_adapter(self, adapter_id: int) -> bool:
        return self._adapter_manager.remove_adapter(adapter_id)

    def remove_all_adapters(self):
        self._adapter_manager.remove_all_adapters()

    def list_adapters(self) -> set[int]:
        return list_adapters_worker(self._adapter_manager.list_adapters)

_adapter_manager instance-attribute

_adapter_manager: LoRAModelManager

_cached_dummy_lora instance-attribute

_cached_dummy_lora: Union[
    None, Literal[False], LoRAModel
] = False

_lora_model_cls instance-attribute

_lora_model_cls = lora_model_cls

_manager_cls class-attribute instance-attribute

_manager_cls: type[LoRAModelManager] = LoRAModelManager

embedding_modules instance-attribute

embedding_modules = embedding_modules

embedding_padding_modules instance-attribute

embedding_padding_modules = embedding_padding_modules

is_enabled property

is_enabled: bool

lora_config instance-attribute

lora_config = lora_config

max_num_batched_tokens instance-attribute

max_num_batched_tokens = max_num_batched_tokens

max_num_seqs instance-attribute

max_num_seqs = max_num_seqs

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    max_num_seqs: int,
    max_num_batched_tokens: int,
    vocab_size: int,
    lora_config: LoRAConfig,
    device: device,
    embedding_modules: dict[str, str],
    embedding_padding_modules: list[str],
    lora_model_cls: type[LoRAModel] = LoRAModel,
    max_position_embeddings: Optional[int] = None,
)
Source code in vllm/lora/worker_manager.py
def __init__(
    self,
    max_num_seqs: int,
    max_num_batched_tokens: int,
    vocab_size: int,
    lora_config: LoRAConfig,
    device: torch.device,
    embedding_modules: dict[str, str],
    embedding_padding_modules: list[str],
    lora_model_cls: type[LoRAModel] = LoRAModel,
    max_position_embeddings: Optional[int] = None,
):
    self._lora_model_cls = lora_model_cls
    self.embedding_modules = embedding_modules
    self.embedding_padding_modules = embedding_padding_modules
    self._cached_dummy_lora: Union[None, Literal[False], LoRAModel] = False
    self.max_num_seqs = max_num_seqs
    self.max_num_batched_tokens = max_num_batched_tokens
    self.vocab_size = vocab_size
    self.lora_config = lora_config
    self.max_position_embeddings = max_position_embeddings
    super().__init__(device)
    # Lazily initialized by create_lora_manager.
    self._adapter_manager: LoRAModelManager

_apply_adapters

_apply_adapters(adapter_requests: set[Any]) -> None
Source code in vllm/lora/worker_manager.py
def _apply_adapters(self, adapter_requests: set[Any]) -> None:
    apply_adapters_worker(adapter_requests, self.list_adapters,
                          self._adapter_manager.adapter_slots,
                          self.remove_adapter, self.add_adapter)

_load_adapter

_load_adapter(lora_request: LoRARequest) -> LoRAModel
Source code in vllm/lora/worker_manager.py
def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
    try:
        supported_lora_modules = (
            self._adapter_manager.supported_lora_modules)
        packed_modules_mapping = (
            self._adapter_manager.packed_modules_mapping)
        expected_lora_modules: list[str] = []
        for module in supported_lora_modules:
            if module in packed_modules_mapping:
                expected_lora_modules.extend(
                    packed_modules_mapping[module])
            else:
                expected_lora_modules.append(module)

        expected_lora_modules = list(set(expected_lora_modules))
        lora_path = get_adapter_absolute_path(lora_request.lora_path)

        peft_helper = PEFTHelper.from_local_dir(
            lora_path, self.max_position_embeddings,
            lora_request.tensorizer_config_dict)

        # Validates the LoRA configuration against requirements before
        # loading weights, throwing an exception if validation fails.
        peft_helper.validate_legal(self.lora_config)

        # For some models like Qwen2VL, we need to use hf_to_vllm_mapper
        # to ensure correct loading of lora weights.
        model = self._adapter_manager.model
        hf_to_vllm_mapper = getattr(model, "hf_to_vllm_mapper", None)

        lora = self._lora_model_cls.from_local_checkpoint(
            lora_path,
            expected_lora_modules,
            peft_helper=peft_helper,
            lora_model_id=lora_request.lora_int_id,
            device="cpu",
            dtype=self.lora_config.lora_dtype,
            target_embedding_padding=self.vocab_size +
            self.lora_config.lora_extra_vocab_size,
            embedding_modules=self.embedding_modules,
            embedding_padding_modules=self.embedding_padding_modules,
            tensorizer_config_dict=lora_request.tensorizer_config_dict,
            weights_mapper=hf_to_vllm_mapper)

    except FileNotFoundError as e:
        # FileNotFoundError should be raised if both
        # - No adapter found to download from huggingface (or in
        #       offline mode)
        # - No local adapter files found at `lora_request.lora_path`
        # For NotFoundError
        raise ValueError(
            f"Loading lora {lora_request.lora_name} failed: No adapter "
            f"found for {lora_request.lora_path}") from e
    except Exception as e:
        # For BadRequestError
        raise e

    if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size:
        raise ValueError(f"LoRA added vocab size {lora.extra_vocab_size} "
                         f"is greater than lora_extra_vocab_size "
                         f"{self.lora_config.lora_extra_vocab_size}.")
    return lora
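
For reference, a sketch of the request this method consumes. The adapter directory is assumed to be a standard PEFT checkpoint, which is what PEFTHelper.from_local_dir and from_local_checkpoint read; the adapter name and path below are placeholders:

from vllm.lora.request import LoRARequest

# Illustrative layout of a PEFT-style LoRA checkpoint on local disk:
#   /models/loras/sql-adapter/
#     adapter_config.json        # parsed by PEFTHelper.from_local_dir
#     adapter_model.safetensors  # weights read by from_local_checkpoint
request = LoRARequest(
    lora_name="sql-adapter",
    lora_int_id=1,
    lora_path="/models/loras/sql-adapter",
)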

add_adapter

add_adapter(adapter_request: Any) -> bool
Source code in vllm/lora/worker_manager.py
def add_adapter(self, adapter_request: Any) -> bool:
    return add_adapter_worker(adapter_request, self.list_adapters,
                              self._load_adapter,
                              self._adapter_manager.add_adapter,
                              self._adapter_manager.activate_adapter)

add_dummy_lora

add_dummy_lora(
    lora_request: LoRARequest, rank: int
) -> bool
Source code in vllm/lora/worker_manager.py
def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool:
    if lora_request.lora_int_id in self.list_adapters():
        return False
    if isinstance(self._cached_dummy_lora, LoRAModel):
        dummy_lora = self._cached_dummy_lora.clone(
            lora_request.lora_int_id)
    else:
        dummy_lora = self._adapter_manager.create_dummy_lora(
            lora_request.lora_int_id, rank, 1, self.embedding_modules)
        if self._cached_dummy_lora is None:
            self._cached_dummy_lora = dummy_lora
    return self._adapter_manager.add_adapter(dummy_lora)

create_lora_manager

create_lora_manager(model: Module) -> Any
Source code in vllm/lora/worker_manager.py
def create_lora_manager(
    self,
    model: torch.nn.Module,
) -> Any:
    lora_manager = create_lora_manager(
        model,
        max_num_seqs=self.max_num_seqs,
        max_num_batched_tokens=self.max_num_batched_tokens,
        vocab_size=self.vocab_size,
        lora_config=self.lora_config,
        device=self.device,
        lora_manager_cls=self._manager_cls,
    )
    self._adapter_manager = lora_manager
    return lora_manager.model

dummy_lora_cache

dummy_lora_cache()

Use this context manager to reuse the dummy lora model to avoid creating it repeatedly.
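
A hedged usage sketch, assuming an already-constructed manager named worker_lora_manager used during memory profiling; the adapter path is never read when a dummy adapter is created, so a placeholder string is fine:

from vllm.lora.request import LoRARequest

with worker_lora_manager.dummy_lora_cache():
    # The first add_dummy_lora call builds the dummy adapter; later calls
    # clone the cached copy instead of rebuilding it.
    for i in range(1, 5):
        worker_lora_manager.add_dummy_lora(
            LoRARequest(f"dummy-{i}", i, "placeholder-path"), rank=8)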

Source code in vllm/lora/worker_manager.py
@contextmanager
def dummy_lora_cache(self):
    """Use this context manager to reuse the dummy lora model
    to avoid creating it repeatedly."""
    self._cached_dummy_lora = None
    yield
    self._cached_dummy_lora = False

list_adapters

list_adapters() -> set[int]
Source code in vllm/lora/worker_manager.py
def list_adapters(self) -> set[int]:
    return list_adapters_worker(self._adapter_manager.list_adapters)

pin_adapter

pin_adapter(adapter_id: int) -> bool
Source code in vllm/lora/worker_manager.py
def pin_adapter(self, adapter_id: int) -> bool:
    return self._adapter_manager.pin_adapter(adapter_id)

remove_adapter

remove_adapter(adapter_id: int) -> bool
Source code in vllm/lora/worker_manager.py
def remove_adapter(self, adapter_id: int) -> bool:
    return self._adapter_manager.remove_adapter(adapter_id)

remove_all_adapters

remove_all_adapters()
Source code in vllm/lora/worker_manager.py
def remove_all_adapters(self):
    self._adapter_manager.remove_all_adapters()

set_active_adapters

set_active_adapters(
    requests: set[Any], mapping: Optional[Any]
) -> None
Source code in vllm/lora/worker_manager.py
def set_active_adapters(self, requests: set[Any],
                        mapping: Optional[Any]) -> None:
    set_active_adapters_worker(requests, mapping, self._apply_adapters,
                               self._adapter_manager.set_adapter_mapping)