Skip to content

vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel

ScaledMMLinearKernel

Bases: ABC

Source code in vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
class ScaledMMLinearKernel(ABC):
    """Base class for scaled-matmul linear kernels.

    Stores a ScaledMMLinearLayerConfig plus the attribute names under which
    the quantized weight, weight scale, input scale, input zero-point and
    azp adjustment are registered on the owning layer module.  Subclasses
    implement the capability check, post-load weight processing, and the
    forward computation.
    """

    @classmethod
    @abstractmethod
    def get_min_capability(cls) -> int:
        """Abstract: minimum capability (int) this kernel requires.

        Presumably a GPU compute-capability code — confirm at call sites.
        """
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def can_implement(
            cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
        """Abstract: return (is_supported, failure_reason) for config `c`.

        `failure_reason` is None when the config is supported.
        """
        raise NotImplementedError

    def __init__(self, c: ScaledMMLinearLayerConfig, w_q_param_name: str,
                 w_s_param_name: str, i_s_param_name: str,
                 i_zp_param_name: str, azp_adj_param_name: str) -> None:
        # BUG FIX: can_implement() returns a (bool, Optional[str]) tuple.
        # A non-empty tuple is always truthy, so the original
        # `assert self.can_implement(c)` could never fail.  Unpack the
        # result and assert on the boolean, surfacing the failure reason
        # in the assertion message.
        supported, failure_reason = self.can_implement(c)
        assert supported, failure_reason
        self.config = c
        # Attribute names under which the quantization parameters are
        # stored on the layer module (resolved via getattr later).
        self.w_q_name = w_q_param_name
        self.w_s_name = w_s_param_name
        self.i_s_name = i_s_param_name
        self.i_zp_name = i_zp_param_name
        self.azp_adj_name = azp_adj_param_name

    @abstractmethod
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Abstract: transform `layer`'s weight attributes after loading."""
        raise NotImplementedError

    @abstractmethod
    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Abstract: compute the layer's output for input `x`.

        Subclasses implement the forward computation against `layer`'s
        stored parameters, with optional `bias`.
        """
        raise NotImplementedError

    def _get_weight_params(
            self, layer: torch.nn.Module) -> tuple[
                torch.Tensor,  # weight
                torch.Tensor,  # weight_scale
                Optional[torch.Tensor],  # input_scale
                Optional[torch.Tensor],  # input_zp
                Optional[torch.Tensor],  # azp_adj
            ]:
        """Fetch the registered quantization parameter tensors from `layer`."""
        return (
            getattr(layer, self.w_q_name),
            getattr(layer, self.w_s_name),
            getattr(layer, self.i_s_name),
            getattr(layer, self.i_zp_name),
            getattr(layer, self.azp_adj_name),
        )

azp_adj_name instance-attribute

azp_adj_name = azp_adj_param_name

config instance-attribute

config = c

i_s_name instance-attribute

i_s_name = i_s_param_name

i_zp_name instance-attribute

i_zp_name = i_zp_param_name

w_q_name instance-attribute

w_q_name = w_q_param_name

w_s_name instance-attribute

w_s_name = w_s_param_name

__init__

__init__(
    c: ScaledMMLinearLayerConfig,
    w_q_param_name: str,
    w_s_param_name: str,
    i_s_param_name: str,
    i_zp_param_name: str,
    azp_adj_param_name: str,
) -> None
Source code in vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
def __init__(self, c: ScaledMMLinearLayerConfig, w_q_param_name: str,
             w_s_param_name: str, i_s_param_name: str,
             i_zp_param_name: str, azp_adj_param_name: str) -> None:
    """Store the config and the layer attribute names of the quant params.

    Asserts that this kernel can implement the given config before
    recording anything.
    """
    # BUG FIX: can_implement() returns a (bool, Optional[str]) tuple,
    # which is always truthy, so `assert self.can_implement(c)` could
    # never fail.  Unpack and assert on the boolean instead, using the
    # returned reason as the assertion message.
    supported, failure_reason = self.can_implement(c)
    assert supported, failure_reason
    self.config = c
    self.w_q_name = w_q_param_name
    self.w_s_name = w_s_param_name
    self.i_s_name = i_s_param_name
    self.i_zp_name = i_zp_param_name
    self.azp_adj_name = azp_adj_param_name

_get_weight_params

_get_weight_params(
    layer: Module,
) -> tuple[
    Tensor,
    Tensor,
    Optional[Tensor],
    Optional[Tensor],
    Optional[Tensor],
]
Source code in vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
def _get_weight_params(
        self, layer: torch.nn.Module) -> tuple[
            torch.Tensor,  # weight
            torch.Tensor,  # weight_scale
            Optional[torch.Tensor],  # input_scale, 
            Optional[torch.Tensor],  # input_zp
            Optional[torch.Tensor],  # azp_adj
        ]:
    return (
        getattr(layer, self.w_q_name),
        getattr(layer, self.w_s_name),
        getattr(layer, self.i_s_name),
        getattr(layer, self.i_zp_name),
        getattr(layer, self.azp_adj_name),
    )

apply_weights abstractmethod

apply_weights(
    layer: Module, x: Tensor, bias: Optional[Tensor] = None
) -> Tensor
Source code in vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
@abstractmethod
def apply_weights(self,
                  layer: torch.nn.Module,
                  x: torch.Tensor,
                  bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Abstract: compute the layer's output tensor for input `x`.

    Subclasses implement the forward computation against `layer`'s
    stored parameters, with optional `bias`.
    """
    raise NotImplementedError

can_implement abstractmethod classmethod

can_implement(
    c: ScaledMMLinearLayerConfig,
) -> tuple[bool, Optional[str]]
Source code in vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
@classmethod
@abstractmethod
def can_implement(
        cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
    """Abstract: return (is_supported, failure_reason) for config `c`.

    `failure_reason` is presumably None when supported — confirm in
    concrete subclasses.
    """
    raise NotImplementedError

get_min_capability abstractmethod classmethod

get_min_capability() -> int
Source code in vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
@classmethod
@abstractmethod
def get_min_capability(cls) -> int:
    """Abstract: minimum capability (int) this kernel requires.

    Presumably a GPU compute-capability code — confirm at call sites.
    """
    raise NotImplementedError

process_weights_after_loading abstractmethod

process_weights_after_loading(layer: Module) -> None
Source code in vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
@abstractmethod
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    """Abstract: transform `layer`'s weight attributes after loading.

    Called once per layer; subclasses mutate `layer` in place and
    return nothing.
    """
    raise NotImplementedError

ScaledMMLinearLayerConfig dataclass

Source code in vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
@dataclass
class ScaledMMLinearLayerConfig:
    """Flags describing the quantization scheme of a scaled-mm linear layer."""
    # Presumably: per-channel (vs per-tensor) weight scales — confirm at callers.
    is_channelwise: bool
    # Presumably: the input scale is statically known rather than computed
    # dynamically at runtime — confirm at callers.
    is_static_input_scheme: bool
    # Presumably: symmetric input quantization (no zero-point) — confirm.
    input_symmetric: bool

input_symmetric instance-attribute

input_symmetric: bool

is_channelwise instance-attribute

is_channelwise: bool

is_static_input_scheme instance-attribute

is_static_input_scheme: bool

__init__

__init__(
    is_channelwise: bool,
    is_static_input_scheme: bool,
    input_symmetric: bool,
) -> None