
vllm.model_executor.layers.quantization

Modules:

Name Description
aqlm
auto_round
awq
awq_marlin
awq_triton
base_config
bitblas
bitsandbytes
compressed_tensors
deepgemm
deepspeedfp
experts_int8
fbgemm_fp8
fp8
gguf
gptq
gptq_bitblas
gptq_marlin
gptq_marlin_24
hqq_marlin
ipex_quant
kernels
kv_cache
marlin
modelopt
moe_wna16
neuron_quant
ptpc_fp8
qqq
quark
rtn
schema

This file contains the Pydantic schemas for various quantization-related parameters.

torchao
tpu_int8
utils

QUANTIZATION_METHODS module-attribute

QUANTIZATION_METHODS: list[str] = list(
    get_args(QuantizationMethods)
)

QuantizationMethods module-attribute

QuantizationMethods = Literal[
    "aqlm",
    "awq",
    "deepspeedfp",
    "tpu_int8",
    "fp8",
    "ptpc_fp8",
    "fbgemm_fp8",
    "modelopt",
    "modelopt_fp4",
    "marlin",
    "bitblas",
    "gguf",
    "gptq_marlin_24",
    "gptq_marlin",
    "gptq_bitblas",
    "awq_marlin",
    "gptq",
    "compressed-tensors",
    "bitsandbytes",
    "qqq",
    "hqq",
    "experts_int8",
    "neuron_quant",
    "ipex",
    "quark",
    "moe_wna16",
    "torchao",
    "auto-round",
    "rtn",
]
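
Taken together, the Literal type and the derived list are what callers use to validate a method name before resolving its config class. A minimal sketch, assuming vLLM is installed (the chosen method name is just an example):

from vllm.model_executor.layers.quantization import (
    QUANTIZATION_METHODS, get_quantization_config)

method = "gptq_marlin"
if method not in QUANTIZATION_METHODS:
    raise ValueError(f"Unknown quantization method: {method}")
config_cls = get_quantization_config(method)  # resolves to GPTQMarlinConfig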

_CUSTOMIZED_METHOD_TO_QUANT_CONFIG module-attribute

_CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {}

__all__ module-attribute

__all__ = [
    "QuantizationConfig",
    "QuantizationMethods",
    "get_quantization_config",
    "QUANTIZATION_METHODS",
]

QuantizationConfig

Bases: ABC

Base class for quantization configs.

Source code in vllm/model_executor/layers/quantization/base_config.py
class QuantizationConfig(ABC):
    """Base class for quantization configs."""

    def __init__(self):
        super().__init__()
        # mapping is updated by models as they initialize
        self.packed_modules_mapping: dict[str, list[str]] = dict()

    @abstractmethod
    def get_name(self) -> QuantizationMethods:
        """Name of the quantization method."""
        raise NotImplementedError

    @abstractmethod
    def get_supported_act_dtypes(self) -> list[torch.dtype]:
        """List of supported activation dtypes."""
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def get_min_capability(cls) -> int:
        """Minimum GPU capability to support the quantization method.

        E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
        This requirement is due to the custom CUDA kernels used by the
        quantization method.
        """
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def get_config_filenames() -> list[str]:
        """List of filenames to search for in the model directory."""
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def from_config(cls, config: dict[str, Any]) -> "QuantizationConfig":
        """Create a config class from the model's quantization config."""
        raise NotImplementedError

    @classmethod
    def override_quantization_method(
            cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
        """
           Detects if this quantization method can support a given checkpoint
           format by overriding the user specified quantization method -- 
           this method should only be overwritten by subclasses in exceptional 
           circumstances
        """
        return None

    @staticmethod
    def get_from_keys(config: dict[str, Any], keys: list[str]) -> Any:
        """Get a value from the model's quantization config."""
        for key in keys:
            if key in config:
                return config[key]
        raise ValueError(f"Cannot find any of {keys} in the model's "
                         "quantization config.")

    @staticmethod
    def get_from_keys_or(config: dict[str, Any], keys: list[str],
                         default: Any) -> Any:
        """Get a optional value from the model's quantization config."""
        try:
            return QuantizationConfig.get_from_keys(config, keys)
        except ValueError:
            return default

    @abstractmethod
    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional[QuantizeMethodBase]:
        """Get the quantize method to use for the quantized layer.

        Args:
            layer: The layer for the quant method.
            prefix: The full name of the layer in the state dict
        Returns:
            The quantize method. None if the given layer doesn't support quant
            method.
        """
        raise NotImplementedError

    def get_cache_scale(self, name: str) -> Optional[str]:
        return None

    def apply_vllm_mapper(  # noqa: B027
            self, hf_to_vllm_mapper: "WeightsMapper"):
        """
        Interface for models to update module names referenced in
        quantization configs in order to reflect the vllm model structure

        :param hf_to_vllm_mapper: maps from hf model structure (the assumed
            structure of the qconfig) to vllm model structure
        """
        # TODO (@kylesayrs): add implementations for all subclasses
        pass

packed_modules_mapping instance-attribute

packed_modules_mapping: dict[str, list[str]] = dict()

__init__

__init__()
Source code in vllm/model_executor/layers/quantization/base_config.py
def __init__(self):
    super().__init__()
    # mapping is updated by models as they initialize
    self.packed_modules_mapping: dict[str, list[str]] = dict()

apply_vllm_mapper

apply_vllm_mapper(hf_to_vllm_mapper: WeightsMapper)

Interface for models to update module names referenced in quantization configs in order to reflect the vllm model structure.

Parameters:

    hf_to_vllm_mapper (WeightsMapper): maps from hf model structure (the assumed structure of the qconfig) to vllm model structure. (required)

Source code in vllm/model_executor/layers/quantization/base_config.py
def apply_vllm_mapper(  # noqa: B027
        self, hf_to_vllm_mapper: "WeightsMapper"):
    """
    Interface for models to update module names referenced in
    quantization configs in order to reflect the vllm model structure

    :param hf_to_vllm_mapper: maps from hf model structure (the assumed
        structure of the qconfig) to vllm model structure
    """
    # TODO (@kylesayrs): add implementations for all subclasses
    pass
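
A hedged sketch of how a model definition might invoke this hook, assuming a WeightsMapper built from an orig_to_new_prefix mapping (the import path, the prefixes, and the pre-existing quant_config object are illustrative):

from vllm.model_executor.models.utils import WeightsMapper

# hypothetical remapping of HF module prefixes onto the vLLM module tree
hf_to_vllm_mapper = WeightsMapper(
    orig_to_new_prefix={"model.": "language_model.model."})
quant_config.apply_vllm_mapper(hf_to_vllm_mapper)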

from_config abstractmethod classmethod

from_config(config: dict[str, Any]) -> QuantizationConfig

Create a config class from the model's quantization config.

Source code in vllm/model_executor/layers/quantization/base_config.py
@classmethod
@abstractmethod
def from_config(cls, config: dict[str, Any]) -> "QuantizationConfig":
    """Create a config class from the model's quantization config."""
    raise NotImplementedError

get_cache_scale

get_cache_scale(name: str) -> Optional[str]
Source code in vllm/model_executor/layers/quantization/base_config.py
def get_cache_scale(self, name: str) -> Optional[str]:
    return None

get_config_filenames abstractmethod staticmethod

get_config_filenames() -> list[str]

List of filenames to search for in the model directory.

Source code in vllm/model_executor/layers/quantization/base_config.py
@staticmethod
@abstractmethod
def get_config_filenames() -> list[str]:
    """List of filenames to search for in the model directory."""
    raise NotImplementedError

get_from_keys staticmethod

get_from_keys(
    config: dict[str, Any], keys: list[str]
) -> Any

Get a value from the model's quantization config.

Source code in vllm/model_executor/layers/quantization/base_config.py
@staticmethod
def get_from_keys(config: dict[str, Any], keys: list[str]) -> Any:
    """Get a value from the model's quantization config."""
    for key in keys:
        if key in config:
            return config[key]
    raise ValueError(f"Cannot find any of {keys} in the model's "
                     "quantization config.")

get_from_keys_or staticmethod

get_from_keys_or(
    config: dict[str, Any], keys: list[str], default: Any
) -> Any

Get an optional value from the model's quantization config.

Source code in vllm/model_executor/layers/quantization/base_config.py
@staticmethod
def get_from_keys_or(config: dict[str, Any], keys: list[str],
                     default: Any) -> Any:
    """Get a optional value from the model's quantization config."""
    try:
        return QuantizationConfig.get_from_keys(config, keys)
    except ValueError:
        return default
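
For illustration, a small sketch of how the two key-lookup helpers behave on a hand-written config dict (the keys below are made up, not taken from any real checkpoint):

from vllm.model_executor.layers.quantization.base_config import QuantizationConfig

cfg = {"bits": 4, "group_size": 128}
QuantizationConfig.get_from_keys(cfg, ["bits", "w_bit"])         # -> 4
QuantizationConfig.get_from_keys_or(cfg, ["zero_point"], False)  # -> False (default)
QuantizationConfig.get_from_keys(cfg, ["zero_point"])            # raises ValueError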

get_min_capability abstractmethod classmethod

get_min_capability() -> int

Minimum GPU capability to support the quantization method.

E.g., 70 for Volta, 75 for Turing, 80 for Ampere. This requirement is due to the custom CUDA kernels used by the quantization method.

Source code in vllm/model_executor/layers/quantization/base_config.py
@classmethod
@abstractmethod
def get_min_capability(cls) -> int:
    """Minimum GPU capability to support the quantization method.

    E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
    This requirement is due to the custom CUDA kernels used by the
    quantization method.
    """
    raise NotImplementedError
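
The returned integer encodes the CUDA compute capability as major * 10 + minor, matching the tuple from torch.cuda.get_device_capability(). A sketch of the check a caller might perform, where SomeQuantConfig stands in for any concrete subclass:

import torch

major, minor = torch.cuda.get_device_capability()  # e.g. (8, 0) on Ampere
capability = major * 10 + minor                     # -> 80
if capability < SomeQuantConfig.get_min_capability():
    raise ValueError("GPU is too old for this quantization method")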

get_name abstractmethod

get_name() -> QuantizationMethods

Name of the quantization method.

Source code in vllm/model_executor/layers/quantization/base_config.py
@abstractmethod
def get_name(self) -> QuantizationMethods:
    """Name of the quantization method."""
    raise NotImplementedError

get_quant_method abstractmethod

get_quant_method(
    layer: Module, prefix: str
) -> Optional[QuantizeMethodBase]

Get the quantize method to use for the quantized layer.

Parameters:

    layer (Module): The layer for the quant method. (required)
    prefix (str): The full name of the layer in the state dict. (required)

Returns:

    The quantize method, or None if the given layer doesn't support a quant method.

Source code in vllm/model_executor/layers/quantization/base_config.py
@abstractmethod
def get_quant_method(self, layer: torch.nn.Module,
                     prefix: str) -> Optional[QuantizeMethodBase]:
    """Get the quantize method to use for the quantized layer.

    Args:
        layer: The layer for the quant method.
        prefix: The full name of the layer in the state dict
    Returns:
        The quantize method. None if the given layer doesn't support quant
        method.
    """
    raise NotImplementedError
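
A hedged sketch of how a layer implementation might consume this hook during construction; quant_config, layer, and the prefix below stand in for whatever the model loader actually provides:

quant_method = quant_config.get_quant_method(
    layer, prefix="model.layers.0.self_attn.qkv_proj")
if quant_method is None:
    # this layer is not quantized; fall back to the unquantized weights path
    ...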

get_supported_act_dtypes abstractmethod

get_supported_act_dtypes() -> list[dtype]

List of supported activation dtypes.

Source code in vllm/model_executor/layers/quantization/base_config.py
@abstractmethod
def get_supported_act_dtypes(self) -> list[torch.dtype]:
    """List of supported activation dtypes."""
    raise NotImplementedError

override_quantization_method classmethod

override_quantization_method(
    hf_quant_cfg, user_quant
) -> Optional[QuantizationMethods]

Detects if this quantization method can support a given checkpoint format by overriding the user-specified quantization method -- this method should only be overridden by subclasses in exceptional circumstances.

Source code in vllm/model_executor/layers/quantization/base_config.py
@classmethod
def override_quantization_method(
        cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
    """
       Detects if this quantization method can support a given checkpoint
       format by overriding the user specified quantization method -- 
       this method should only be overwritten by subclasses in exceptional 
       circumstances
    """
    return None
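
For context, a hypothetical sketch of the kind of override a kernel-specific backend might provide; the condition is deliberately simplified relative to the real checks performed by backends such as gptq_marlin, and hf_quant_cfg is assumed to be dict-like:

class MyGPTQBackendConfig(QuantizationConfig):  # hypothetical subclass, other methods elided

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg, user_quant):
        # illustrative only: claim plain GPTQ checkpoints when this backend can run them
        if hf_quant_cfg.get("quant_method") == "gptq" and user_quant in (None, "gptq"):
            return "gptq_marlin"
        return None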

get_quantization_config

get_quantization_config(
    quantization: str,
) -> type[QuantizationConfig]
Source code in vllm/model_executor/layers/quantization/__init__.py
def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
    if quantization not in QUANTIZATION_METHODS:
        raise ValueError(f"Invalid quantization method: {quantization}")

    # lazy import to avoid triggering `torch.compile` too early
    from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig

    from .aqlm import AQLMConfig
    from .auto_round import AutoRoundConfig
    from .awq import AWQConfig
    from .awq_marlin import AWQMarlinConfig
    from .bitblas import BitBLASConfig
    from .bitsandbytes import BitsAndBytesConfig
    from .compressed_tensors.compressed_tensors import (  # noqa: E501
        CompressedTensorsConfig)
    from .deepspeedfp import DeepSpeedFPConfig
    from .experts_int8 import ExpertsInt8Config
    from .fbgemm_fp8 import FBGEMMFp8Config
    from .fp8 import Fp8Config
    from .gguf import GGUFConfig
    from .gptq import GPTQConfig
    from .gptq_bitblas import GPTQBitBLASConfig
    from .gptq_marlin import GPTQMarlinConfig
    from .gptq_marlin_24 import GPTQMarlin24Config
    from .hqq_marlin import HQQMarlinConfig
    from .ipex_quant import IPEXConfig
    from .marlin import MarlinConfig
    from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config
    from .moe_wna16 import MoeWNA16Config
    from .neuron_quant import NeuronQuantConfig
    from .ptpc_fp8 import PTPCFp8Config
    from .qqq import QQQConfig
    from .rtn import RTNConfig
    from .torchao import TorchAOConfig
    from .tpu_int8 import Int8TpuConfig

    method_to_config: dict[str, type[QuantizationConfig]] = {
        "aqlm": AQLMConfig,
        "awq": AWQConfig,
        "deepspeedfp": DeepSpeedFPConfig,
        "tpu_int8": Int8TpuConfig,
        "fp8": Fp8Config,
        "fbgemm_fp8": FBGEMMFp8Config,
        "modelopt": ModelOptFp8Config,
        "modelopt_fp4": ModelOptNvFp4Config,
        "marlin": MarlinConfig,
        "bitblas": BitBLASConfig,
        "gguf": GGUFConfig,
        "gptq_marlin_24": GPTQMarlin24Config,
        "gptq_marlin": GPTQMarlinConfig,
        "gptq_bitblas": GPTQBitBLASConfig,
        "awq_marlin": AWQMarlinConfig,
        "gptq": GPTQConfig,
        "compressed-tensors": CompressedTensorsConfig,
        "bitsandbytes": BitsAndBytesConfig,
        "ptpc_fp8": PTPCFp8Config,
        "qqq": QQQConfig,
        "hqq": HQQMarlinConfig,
        "experts_int8": ExpertsInt8Config,
        "neuron_quant": NeuronQuantConfig,
        "ipex": IPEXConfig,
        "quark": QuarkConfig,
        "moe_wna16": MoeWNA16Config,
        "torchao": TorchAOConfig,
        "auto-round": AutoRoundConfig,
        "rtn": RTNConfig
    }
    # Update the `method_to_config` with customized quantization methods.
    method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG)

    return method_to_config[quantization]
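
A minimal usage sketch; the printed filename is typical for GPTQ checkpoints but shown only as an example:

from vllm.model_executor.layers.quantization import get_quantization_config

gptq_cls = get_quantization_config("gptq")
print(gptq_cls.get_config_filenames())  # e.g. ['quantize_config.json']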

register_quantization_config

register_quantization_config(quantization: str)

Register a customized vllm quantization config.

When a quantization method is not supported by vllm, you can register a customized quantization config to support it.

Parameters:

    quantization (str): The quantization method name. (required)

Examples:

>>> from vllm.model_executor.layers.quantization import register_quantization_config
>>> from vllm.model_executor.layers.quantization import get_quantization_config
>>> from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
>>>
>>> @register_quantization_config("my_quant")
... class MyQuantConfig(QuantizationConfig):
...     pass
>>>
>>> get_quantization_config("my_quant")
<class 'MyQuantConfig'>
Source code in vllm/model_executor/layers/quantization/__init__.py
def register_quantization_config(quantization: str):
    """Register a customized vllm quantization config.

    When a quantization method is not supported by vllm, you can register a customized
    quantization config to support it.

    Args:
        quantization (str): The quantization method name.

    Examples:
        >>> from vllm.model_executor.layers.quantization import register_quantization_config
        >>> from vllm.model_executor.layers.quantization import get_quantization_config
        >>> from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
        >>>
        >>> @register_quantization_config("my_quant")
        ... class MyQuantConfig(QuantizationConfig):
        ...     pass
        >>>
        >>> get_quantization_config("my_quant")
        <class 'MyQuantConfig'>
    """  # noqa: E501

    def _wrapper(quant_config_cls):
        if quantization in QUANTIZATION_METHODS:
            raise ValueError(
                f"The quantization method `{quantization}` is already exists.")
        if not issubclass(quant_config_cls, QuantizationConfig):
            raise ValueError("The quantization config must be a subclass of "
                             "`QuantizationConfig`.")
        _CUSTOMIZED_METHOD_TO_QUANT_CONFIG[quantization] = quant_config_cls
        QUANTIZATION_METHODS.append(quantization)
        return quant_config_cls

    return _wrapper
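
The docstring example registers a stub; in practice a registered class also implements the abstract interface so it can be instantiated by the loader. A hypothetical, more complete sketch (every name and value below is illustrative and not part of vLLM):

from typing import Any, Optional

import torch

from vllm.model_executor.layers.quantization import register_quantization_config
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig, QuantizeMethodBase)


@register_quantization_config("my_int8")
class MyInt8Config(QuantizationConfig):

    def get_name(self):
        # custom names are not part of the QuantizationMethods literal,
        # so a real implementation may need a type: ignore here
        return "my_int8"

    def get_supported_act_dtypes(self) -> list[torch.dtype]:
        return [torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        return 75  # Turing or newer (illustrative)

    @staticmethod
    def get_config_filenames() -> list[str]:
        return ["quantize_config.json"]

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "MyInt8Config":
        return cls()

    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional[QuantizeMethodBase]:
        return None  # illustrative: no layer is actually quantized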