vllm.model_executor.layers.quantization.deepspeedfp

DeepSpeedFPConfig

Bases: QuantizationConfig

Config for DeepSpeed FP quantizer. It supports fp6 and fp8.

Parameters:

    weight_bits (int, default 8): the target quantization bits, 6 or 8.
    group_size (int, default 512): the group size for quantization.
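
A minimal usage sketch (assuming vLLM is importable; constructing the config does not require deepspeed itself):

from vllm.model_executor.layers.quantization.deepspeedfp import (
    DeepSpeedFPConfig)

config = DeepSpeedFPConfig()                   # 8-bit weights, groups of 512
fp6_config = DeepSpeedFPConfig(weight_bits=6)  # fp6 is the other supported width

# Any other bit width is rejected at construction time.
try:
    DeepSpeedFPConfig(weight_bits=4)
except ValueError as err:
    print(err)
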
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
class DeepSpeedFPConfig(QuantizationConfig):
    """Config for DeepSpeed FP quantizer. It supports fp6 and fp8.

    Args: 
        weight_bits: the target quantization bits, 6 or 8.
        group_size: the group size for quantization, defaults to 512.
    """

    def __init__(
        self,
        weight_bits: int = 8,
        group_size: int = 512,
    ) -> None:
        super().__init__()
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.valid_types = [torch.bfloat16, torch.float16]

        if self.weight_bits not in (6, 8):
            raise ValueError(
                "Currently, only 6-bit or 8-bit weight quantization are "
                f"supported for DeepSpeed FP quantizaiton, but got "
                f"{self.weight_bits} bits.")

    def __repr__(self) -> str:
        return (f"DeepSpeedFPConfig(weight_bits={self.weight_bits}), "
                f"group_size={self.group_size}")

    @classmethod
    def get_name(cls) -> QuantizationMethods:
        return "deepspeedfp"

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "DeepSpeedFPConfig":
        weight_bits = cls.get_from_keys(config, ["bits"])
        group_size = cls.get_from_keys(config, ["group_size"])
        return cls(weight_bits=weight_bits, group_size=group_size)

    def get_linear_method(self) -> "DeepSpeedFPLinearMethod":
        return DeepSpeedFPLinearMethod(self)

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        return [torch.half, torch.bfloat16]

    @classmethod
    # TODO: Verify the actual minimum GPU capability required.
    def get_min_capability(cls) -> int:
        return 60

    @staticmethod
    def get_config_filenames() -> list[str]:
        return [
            "quant_config.json",
            "quantize_config.json",
        ]

    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional["DeepSpeedFPLinearMethod"]:
        if isinstance(layer, LinearBase):
            return DeepSpeedFPLinearMethod(self)
        return None

group_size instance-attribute

group_size = group_size

valid_types instance-attribute

valid_types = [bfloat16, float16]

weight_bits instance-attribute

weight_bits = weight_bits

__init__

__init__(
    weight_bits: int = 8, group_size: int = 512
) -> None
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
def __init__(
    self,
    weight_bits: int = 8,
    group_size: int = 512,
) -> None:
    super().__init__()
    self.weight_bits = weight_bits
    self.group_size = group_size
    self.valid_types = [torch.bfloat16, torch.float16]

    if self.weight_bits not in (6, 8):
        raise ValueError(
            "Currently, only 6-bit or 8-bit weight quantization are "
            f"supported for DeepSpeed FP quantizaiton, but got "
            f"{self.weight_bits} bits.")

__repr__

__repr__() -> str
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
def __repr__(self) -> str:
    return (f"DeepSpeedFPConfig(weight_bits={self.weight_bits}), "
            f"group_size={self.group_size}")

from_config classmethod

from_config(config: dict[str, Any]) -> DeepSpeedFPConfig
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
@classmethod
def from_config(cls, config: dict[str, Any]) -> "DeepSpeedFPConfig":
    weight_bits = cls.get_from_keys(config, ["bits"])
    group_size = cls.get_from_keys(config, ["group_size"])
    return cls(weight_bits=weight_bits, group_size=group_size)
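
For illustration, a sketch of the dictionary shape from_config consumes; the keys match the lookups above, though an on-disk quant_config.json may carry additional fields:

quant_dict = {"bits": 6, "group_size": 512}
config = DeepSpeedFPConfig.from_config(quant_dict)
assert config.weight_bits == 6
assert config.group_size == 512
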

get_config_filenames staticmethod

get_config_filenames() -> list[str]
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
@staticmethod
def get_config_filenames() -> list[str]:
    return [
        "quant_config.json",
        "quantize_config.json",
    ]

get_linear_method

get_linear_method() -> DeepSpeedFPLinearMethod
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
def get_linear_method(self) -> "DeepSpeedFPLinearMethod":
    return DeepSpeedFPLinearMethod(self)

get_min_capability classmethod

get_min_capability() -> int
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
@classmethod
# TODO: Verify the actual minimum GPU capability required.
def get_min_capability(cls) -> int:
    return 60
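
The value 60 follows vLLM's convention of encoding compute capability as major * 10 + minor, i.e. CUDA compute capability 6.0 (Pascal) or newer.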

get_name classmethod

get_name() -> QuantizationMethods
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
@classmethod
def get_name(cls) -> QuantizationMethods:
    return "deepspeedfp"

get_quant_method

get_quant_method(
    layer: Module, prefix: str
) -> Optional[DeepSpeedFPLinearMethod]
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
def get_quant_method(self, layer: torch.nn.Module,
                     prefix: str) -> Optional["DeepSpeedFPLinearMethod"]:
    if isinstance(layer, LinearBase):
        return DeepSpeedFPLinearMethod(self)
    return None

get_supported_act_dtypes classmethod

get_supported_act_dtypes() -> list[dtype]
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
@classmethod
def get_supported_act_dtypes(cls) -> list[torch.dtype]:
    return [torch.half, torch.bfloat16]

DeepSpeedFPLinearMethod

Bases: LinearMethodBase

Linear method for DeepSpeedFP quantizer.

Parameters:

    quant_config (DeepSpeedFPConfig, required): the DeepSpeedFP quantization config.
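
A hedged end-to-end sketch of how the pieces fit together, assuming a CUDA device, PyTorch 2.x (for the torch.device context manager), and deepspeed>=0.14.2; vLLM itself drives these calls through its LinearBase layers and parallel-aware weight loaders rather than a bare nn.Module:

import torch
import torch.nn as nn

from vllm.model_executor.layers.quantization.deepspeedfp import (
    DeepSpeedFPConfig, DeepSpeedFPLinearMethod)

config = DeepSpeedFPConfig(weight_bits=8, group_size=512)
method = DeepSpeedFPLinearMethod(config)

with torch.device("cuda"):
    layer = nn.Module()
    # Registers layer.weight as a DeepSpeedFPParameter and attaches a
    # weight_loader that quantizes whatever checkpoint tensor it receives.
    method.create_weights(layer,
                          input_size_per_partition=512,
                          output_partition_sizes=[1024],
                          input_size=512,
                          output_size=1024,
                          params_dtype=torch.bfloat16)

full_weight = torch.randn(1024, 512, dtype=torch.bfloat16, device="cuda")
layer.weight.weight_loader(layer.weight, full_weight)  # quantizes in place

x = torch.randn(4, 512, dtype=torch.bfloat16, device="cuda")
y = method.apply(layer, x)                             # dequantize + F.linear
print(y.shape)                                         # torch.Size([4, 1024])
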
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
class DeepSpeedFPLinearMethod(LinearMethodBase):
    """Linear method for DeepSpeedFP quantizer.

    Args:
        quant_config: the DeepSpeedFP quantization config.
    """

    def __init__(self, quant_config: DeepSpeedFPConfig):
        self.quant_config = quant_config
        self.weight = None

    def create_weights(self,
                       layer: torch.nn.Module,
                       input_size_per_partition: int,
                       output_partition_sizes: list[int],
                       input_size: int,
                       output_size: int,
                       params_dtype: torch.dtype,
                       weight_loader=None,
                       **extra_weight_attrs):
        del output_size
        del input_size
        output_size_per_partition = sum(output_partition_sizes)
        weight = DeepSpeedFPParameter(
            torch.Size((output_size_per_partition, input_size_per_partition)),
            params_dtype=params_dtype,
            quant_config=self.quant_config,
        )
        set_weight_attrs(weight, {
            "input_dim": 1,
            "output_dim": 0,
        })
        layer.register_parameter("weight", weight)

        def quant_weight_loader(param, loaded_weight, *args, **kwargs):
            # Calls the original weight loader (if any), quantizes the result,
            # and then loads the quantized parameter.
            if weight_loader is not None:
                orig_param_data = param.data
                param.data = param.ds_dequantize()
                weight_loader(param, loaded_weight, *args, **kwargs)
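                # Swap back: param.data returns to the packed int8 storage,
                # while the loader's full-precision output (currently in
                # param.data) becomes the tensor to re-quantize below.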
                param.data, loaded_weight = orig_param_data, param.data
            param.ds_quantize_(loaded_weight.cuda())

        extra_weight_attrs["weight_loader"] = quant_weight_loader
        set_weight_attrs(weight, extra_weight_attrs)

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        weight = layer.weight
        y = weight.ds_dequantize()
        return F.linear(x, y, bias)

quant_config instance-attribute

quant_config = quant_config

weight instance-attribute

weight = None

__init__

__init__(quant_config: DeepSpeedFPConfig)
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
def __init__(self, quant_config: DeepSpeedFPConfig):
    self.quant_config = quant_config
    self.weight = None

apply

apply(
    layer: Module, x: Tensor, bias: Optional[Tensor] = None
) -> Tensor
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
def apply(self,
          layer: torch.nn.Module,
          x: torch.Tensor,
          bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    weight = layer.weight
    y = weight.ds_dequantize()
    return F.linear(x, y, bias)

create_weights

create_weights(
    layer: Module,
    input_size_per_partition: int,
    output_partition_sizes: list[int],
    input_size: int,
    output_size: int,
    params_dtype: dtype,
    weight_loader=None,
    **extra_weight_attrs,
)
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
def create_weights(self,
                   layer: torch.nn.Module,
                   input_size_per_partition: int,
                   output_partition_sizes: list[int],
                   input_size: int,
                   output_size: int,
                   params_dtype: torch.dtype,
                   weight_loader=None,
                   **extra_weight_attrs):
    del output_size
    del input_size
    output_size_per_partition = sum(output_partition_sizes)
    weight = DeepSpeedFPParameter(
        torch.Size((output_size_per_partition, input_size_per_partition)),
        params_dtype=params_dtype,
        quant_config=self.quant_config,
    )
    set_weight_attrs(weight, {
        "input_dim": 1,
        "output_dim": 0,
    })
    layer.register_parameter("weight", weight)

    def quant_weight_loader(param, loaded_weight, *args, **kwargs):
        # Calls the original weight loader (if any), quantizes the result,
        # and then loads the quantized parameter.
        if weight_loader is not None:
            orig_param_data = param.data
            param.data = param.ds_dequantize()
            weight_loader(param, loaded_weight, *args, **kwargs)
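            # Swap back: param.data returns to the packed int8 storage,
            # while the loader's full-precision output (currently in
            # param.data) becomes the tensor to re-quantize below.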
            param.data, loaded_weight = orig_param_data, param.data
        param.ds_quantize_(loaded_weight.cuda())

    extra_weight_attrs["weight_loader"] = quant_weight_loader
    set_weight_attrs(weight, extra_weight_attrs)

DeepSpeedFPParameter

Bases: Parameter

DeepSpeedFP quantized parameter class that implements fp8/fp6 quantization via DeepSpeed. Weights are stored in quantized form on GPUs, and can be dequantized on-the-fly when needed by the model.
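
To make the packed layout concrete, a small sketch of the int8 storage allocated for a weight (pure arithmetic over the shapes used in __new__, no GPU needed; the extra 4 bytes per group presumably hold that group's scale metadata, which is an inference from the layout rather than something stated here):

import torch

from vllm.model_executor.layers.quantization.deepspeedfp import (
    DeepSpeedFPConfig)

config = DeepSpeedFPConfig(weight_bits=8, group_size=512)
orig_shape = torch.Size((4096, 4096))

groups = orig_shape.numel() // config.group_size                    # 32768
bytes_per_group = config.group_size * config.weight_bits // 8 + 4   # 516
print(groups, bytes_per_group)  # int8 storage of shape (32768, 516)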

Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
class DeepSpeedFPParameter(nn.Parameter):
    """
    DeepSpeedFP quantized parameter class that implements fp8/fp6
    quantization via DeepSpeed. Weights are stored in quantized form on
    GPUs, and can be dequantized on-the-fly when needed by the model.
    """

    def __new__(cls, orig_shape: torch.Size, params_dtype: torch.dtype,
                quant_config: DeepSpeedFPConfig):
        try:
            import deepspeed
            if deepspeed.__version__ < "0.14.2":
                raise ImportError("deepspeed version is wrong. Please "
                                  "install deepspeed>=0.14.2.")
            from deepspeed.ops.fp_quantizer import FP_Quantize
        except ImportError as err:
            raise ImportError("Please install deepspeed>=0.14.2 via "
                              "`pip install deepspeed>=0.14.2` to use "
                              "deepspeedfp quantizer.") from err
        data = torch.empty((
            orig_shape.numel() // quant_config.group_size,
            quant_config.group_size * quant_config.weight_bits // 8 + 4,
        ),
                           dtype=torch.int8)
        self = torch.Tensor._make_subclass(cls, data, data.requires_grad)
        self.orig_shape = orig_shape
        self.quant_config = quant_config
        self.fp_quantizer = FP_Quantize(group_size=quant_config.group_size)
        self.fp_quantizer.orig_shape = orig_shape
        self.fp_quantizer.orig_dtype = params_dtype
        return self

    def ds_quantize_(self, tensor: torch.Tensor):
        assert tensor.device.type == "cuda" and tensor.dtype != torch.int8
        return self.data.copy_(
            self.fp_quantizer.quantize(
                tensor.data,
                q_bits=self.quant_config.weight_bits,
            ))

    def ds_dequantize(self, fp_out=None) -> torch.Tensor:
        """
        Return a tensor containing the dequantized weights of this parameter.
        """
        assert self.data.device.type == "cuda" and self.data.dtype == torch.int8
        return self.fp_quantizer.dequantize(
            self.data, fp_out=fp_out, q_bits=self.quant_config.weight_bits)

    def ds_selective_dequantize(self, indices, fp_out=None) -> torch.Tensor:
        """
        Return a tensor where only the weights at `indices` are dequantized
        (to save HBM -> SRAM bandwidth).
        """
        assert self.data.device.type == "cuda" and self.data.dtype == torch.int8
        return self.fp_quantizer.selective_dequantize(
            self.data,
            indices,
            fp_out=fp_out,
            q_bits=self.quant_config.weight_bits)

__new__

__new__(
    orig_shape: Size,
    params_dtype: dtype,
    quant_config: DeepSpeedFPConfig,
)
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
def __new__(cls, orig_shape: torch.Size, params_dtype: torch.dtype,
            quant_config: DeepSpeedFPConfig):
    try:
        import deepspeed
        if deepspeed.__version__ < "0.14.2":
            raise ImportError("deepspeed version is wrong. Please "
                              "install deepspeed>=0.14.2.")
        from deepspeed.ops.fp_quantizer import FP_Quantize
    except ImportError as err:
        raise ImportError("Please install deepspeed>=0.14.2 via "
                          "`pip install deepspeed>=0.14.2` to use "
                          "deepspeedfp quantizer.") from err
    data = torch.empty((
        orig_shape.numel() // quant_config.group_size,
        quant_config.group_size * quant_config.weight_bits // 8 + 4,
    ),
                       dtype=torch.int8)
    self = torch.Tensor._make_subclass(cls, data, data.requires_grad)
    self.orig_shape = orig_shape
    self.quant_config = quant_config
    self.fp_quantizer = FP_Quantize(group_size=quant_config.group_size)
    self.fp_quantizer.orig_shape = orig_shape
    self.fp_quantizer.orig_dtype = params_dtype
    return self

ds_dequantize

ds_dequantize(fp_out=None) -> Tensor

Return a tensor containing the dequantized weights of this parameter.

Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
def ds_dequantize(self, fp_out=None) -> torch.Tensor:
    """
    Return a tensor containing the dequantized weights of this parameter.
    """
    assert self.data.device.type == "cuda" and self.data.dtype == torch.int8
    return self.fp_quantizer.dequantize(
        self.data, fp_out=fp_out, q_bits=self.quant_config.weight_bits)

ds_quantize_

ds_quantize_(tensor: Tensor)
Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
def ds_quantize_(self, tensor: torch.Tensor):
    assert tensor.device.type == "cuda" and tensor.dtype != torch.int8
    return self.data.copy_(
        self.fp_quantizer.quantize(
            tensor.data,
            q_bits=self.quant_config.weight_bits,
        ))
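
A hedged quantize/dequantize round trip using the fp6 path, assuming a CUDA device, deepspeed>=0.14.2 with the fp6 kernel available, and PyTorch 2.x for the torch.device context manager:

import torch

from vllm.model_executor.layers.quantization.deepspeedfp import (
    DeepSpeedFPConfig, DeepSpeedFPParameter)

config = DeepSpeedFPConfig(weight_bits=6, group_size=512)
weight = torch.randn(1024, 512, dtype=torch.bfloat16, device="cuda")

with torch.device("cuda"):
    param = DeepSpeedFPParameter(weight.shape,
                                 params_dtype=torch.bfloat16,
                                 quant_config=config)

param.ds_quantize_(weight)               # pack into int8 groups in place
restored = param.ds_dequantize()         # bf16 approximation of the original
print((restored - weight).abs().max())   # fp6 reconstruction error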

ds_selective_dequantize

ds_selective_dequantize(indices, fp_out=None) -> Tensor

Return a tensor where only the weights at indices are dequantized (to save HBM -> SRAM bandwidth).

Source code in vllm/model_executor/layers/quantization/deepspeedfp.py
def ds_selective_dequantize(self, indices, fp_out=None) -> torch.Tensor:
    """
    Return a tensor where only the weights at `indices` are dequantized
    (to save HBM -> SRAM bandwidth).
    """
    assert self.data.device.type == "cuda" and self.data.dtype == torch.int8
    return self.fp_quantizer.selective_dequantize(
        self.data,
        indices,
        fp_out=fp_out,
        q_bits=self.quant_config.weight_bits)