vllm.model_executor.layers.activation

Custom activation functions.

_ACTIVATION_AND_MUL_REGISTRY module-attribute

_ACTIVATION_AND_MUL_REGISTRY = LazyDict(
    {
        "gelu": lambda: GeluAndMul(),
        "silu": lambda: SiluAndMul(),
        "geglu": lambda: GeluAndMul(),
    }
)

_ACTIVATION_REGISTRY module-attribute

_ACTIVATION_REGISTRY = LazyDict(
    {
        "gelu": lambda: GELU(),
        "gelu_fast": lambda: FastGELU(),
        "gelu_new": lambda: NewGELU(),
        "gelu_pytorch_tanh": lambda: GELU(
            approximate="tanh"
        ),
        "relu": lambda: ReLU(),
        "relu2": lambda: ReLUSquaredActivation(),
        "silu": lambda: SiLU(),
        "quick_gelu": lambda: QuickGELU(),
    }
)
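
Both registries map lowercase names to zero-argument constructors; they back the get_act_fn and get_act_and_mul_fn helpers documented at the bottom of this page. A minimal lookup sketch, assuming vLLM is installed:

import torch
from vllm.model_executor.layers.activation import (get_act_and_mul_fn,
                                                    get_act_fn)

act = get_act_fn("gelu_pytorch_tanh")     # nn.GELU(approximate="tanh")
gated = get_act_and_mul_fn("silu")        # SiluAndMul()
print(type(act).__name__, type(gated).__name__)

x = torch.randn(4, 256)
print(act(x).shape)                       # torch.Size([4, 256])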

FastGELU

Bases: CustomOp

Source code in vllm/model_executor/layers/activation.py
@CustomOp.register("gelu_fast")
class FastGELU(CustomOp):

    def __init__(self):
        super().__init__()
        if current_platform.is_cuda_alike() or current_platform.is_cpu():
            self.op = torch.ops._C.gelu_fast
        elif current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops
            self.op = ipex_ops.gelu_fast

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
                                           (1.0 + 0.044715 * x * x)))

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
        self.op(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        return self.op(x)

op instance-attribute

op = gelu_fast

__init__

__init__()
Source code in vllm/model_executor/layers/activation.py
def __init__(self):
    super().__init__()
    if current_platform.is_cuda_alike() or current_platform.is_cpu():
        self.op = torch.ops._C.gelu_fast
    elif current_platform.is_xpu():
        from vllm._ipex_ops import ipex_ops
        self.op = ipex_ops.gelu_fast

forward_cuda

forward_cuda(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    self.op(out, x)
    return out

forward_native

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py
def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
                                       (1.0 + 0.044715 * x * x)))

forward_xpu

forward_xpu(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
    return self.op(x)
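
The constant 0.7978845608 in forward_native is sqrt(2/pi), so FastGELU is the usual tanh approximation of GELU. A quick pure-PyTorch check of that claim (illustrative only, not part of vLLM):

import torch
import torch.nn.functional as F

x = torch.randn(1024)
fast = 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))
print((fast - F.gelu(x, approximate="tanh")).abs().max())   # ~1e-7 in float32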

FatreluAndMul

Bases: CustomOp

An activation function for FATReLU.

The function computes x -> FATReLU(x[:d]) * x[d:] where d = x.shape[-1] // 2. This is used in openbmb/MiniCPM-S-1B-sft.

Shapes

x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
return: (num_tokens, d) or (batch_size, seq_len, d)

Source code in vllm/model_executor/layers/activation.py
@CustomOp.register("fatrelu_and_mul")
class FatreluAndMul(CustomOp):
    """An activation function for FATReLU.

    The function computes x -> FATReLU(x[:d]) * x[d:] where
    d = x.shape[-1] // 2.
    This is used in openbmb/MiniCPM-S-1B-sft.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    def __init__(self, threshold: float = 0.):
        super().__init__()
        self.threshold = threshold
        if current_platform.is_cuda_alike():
            self.op = torch.ops._C.fatrelu_and_mul
        elif current_platform.is_cpu():
            self._forward_method = self.forward_native

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        x1 = x[..., :d]
        x2 = x[..., d:]
        x1 = F.threshold(x1, self.threshold, 0.0)
        return x1 * x2

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x, self.threshold)
        return out

_forward_method instance-attribute

_forward_method = forward_native

op instance-attribute

op = fatrelu_and_mul

threshold instance-attribute

threshold = threshold

__init__

__init__(threshold: float = 0.0)
Source code in vllm/model_executor/layers/activation.py
def __init__(self, threshold: float = 0.):
    super().__init__()
    self.threshold = threshold
    if current_platform.is_cuda_alike():
        self.op = torch.ops._C.fatrelu_and_mul
    elif current_platform.is_cpu():
        self._forward_method = self.forward_native

forward_cuda

forward_cuda(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    output_shape = (x.shape[:-1] + (d, ))
    out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
    self.op(out, x, self.threshold)
    return out

forward_native

forward_native(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    x1 = x[..., :d]
    x2 = x[..., d:]
    x1 = F.threshold(x1, self.threshold, 0.0)
    return x1 * x2
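
A pure-PyTorch sketch of the same computation as forward_native, to make the gating concrete; the threshold value used here is illustrative only:

import torch
import torch.nn.functional as F

threshold = 0.03
x = torch.randn(8, 2 * 64)                        # (num_tokens, 2 * d)
d = x.shape[-1] // 2
gate = F.threshold(x[..., :d], threshold, 0.0)    # FATReLU: zero values <= threshold
out = gate * x[..., d:]
print(out.shape)                                  # torch.Size([8, 64])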

GeluAndMul

Bases: CustomOp

An activation function for GeGLU.

The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.

Shapes

x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
return: (batch_size, seq_len, d) or (num_tokens, d)

Source code in vllm/model_executor/layers/activation.py
@CustomOp.register("gelu_and_mul")
class GeluAndMul(CustomOp):
    """An activation function for GeGLU.

    The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.

    Shapes:
        x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
        return: (batch_size, seq_len, d) or (num_tokens, d)
    """

    def __init__(self, approximate: str = "none"):
        super().__init__()
        self.approximate = approximate
        if approximate not in ("none", "tanh"):
            raise ValueError(f"Unknown approximate mode: {approximate}")
        if current_platform.is_cuda_alike() or current_platform.is_cpu():
            if approximate == "none":
                self.op = torch.ops._C.gelu_and_mul
            elif approximate == "tanh":
                self.op = torch.ops._C.gelu_tanh_and_mul
        elif current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops
            if approximate == "none":
                self.op = ipex_ops.gelu_and_mul
            else:
                self.op = ipex_ops.gelu_tanh_and_mul

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out

    def extra_repr(self) -> str:
        return f'approximate={repr(self.approximate)}'

approximate instance-attribute

approximate = approximate

op instance-attribute

op = gelu_and_mul

__init__

__init__(approximate: str = 'none')
Source code in vllm/model_executor/layers/activation.py
def __init__(self, approximate: str = "none"):
    super().__init__()
    self.approximate = approximate
    if approximate not in ("none", "tanh"):
        raise ValueError(f"Unknown approximate mode: {approximate}")
    if current_platform.is_cuda_alike() or current_platform.is_cpu():
        if approximate == "none":
            self.op = torch.ops._C.gelu_and_mul
        elif approximate == "tanh":
            self.op = torch.ops._C.gelu_tanh_and_mul
    elif current_platform.is_xpu():
        from vllm._ipex_ops import ipex_ops
        if approximate == "none":
            self.op = ipex_ops.gelu_and_mul
        else:
            self.op = ipex_ops.gelu_tanh_and_mul

extra_repr

extra_repr() -> str
Source code in vllm/model_executor/layers/activation.py
def extra_repr(self) -> str:
    return f'approximate={repr(self.approximate)}'

forward_cuda

forward_cuda(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    output_shape = (x.shape[:-1] + (d, ))
    out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
    self.op(out, x)
    return out

forward_native

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py
def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    d = x.shape[-1] // 2
    return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]

forward_xpu

forward_xpu(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    output_shape = (x.shape[:-1] + (d, ))
    out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
    self.op(out, x)
    return out
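
A minimal pure-PyTorch reproduction of forward_native for the GeGLU path, with the tanh approximation chosen for illustration; it shows only the split-and-multiply layout, not the fused kernel:

import torch
import torch.nn.functional as F

x = torch.randn(2, 16, 2 * 128)                   # (batch_size, seq_len, 2 * d)
d = x.shape[-1] // 2
out = F.gelu(x[..., :d], approximate="tanh") * x[..., d:]
print(out.shape)                                  # torch.Size([2, 16, 128])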

GeluAndMulSparse

Bases: CustomOp

An activation function for GeluAndMulSparse.

This activation function is used in Gemma3n. It computes:

    up_proj = self.up_proj(x)
    gate_proj = self.gate_proj(x)
    gate_proj = self._gaussian_topk(gate_proj) # sparsity
    activations = self.act_fn(gate_proj) # gelu
    down_proj = self.down_proj(activations * up_proj)

Shapes

x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
return: (num_tokens, d) or (batch_size, seq_len, d)

Source code in vllm/model_executor/layers/activation.py
@CustomOp.register("gelu_and_mul_sparse")
class GeluAndMulSparse(CustomOp):
    """An activation function for GeluAndMulSparse.
    This activation function is used in Gemma3n. It computes:
        up_proj = self.up_proj(x)
        gate_proj = self.gate_proj(x)
        gate_proj = self._gaussian_topk(gate_proj) # sparsity
        activations = self.act_fn(gate_proj) # gelu
        down_proj = self.down_proj(activations * up_proj)
    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    def __init__(self, activation_sparsity: float, approximate: str = "none"):
        super().__init__()
        # Gelu.
        self.approximate = approximate
        if approximate not in ("none", "tanh"):
            raise ValueError(f"Unknown approximate mode: {approximate}")

        # Sparsity.
        if activation_sparsity == 0.0:
            raise ValueError(
                "activation_sparsity is 0.0. Please use GeluAndMul.")
        target_sparsity_tensor = torch.tensor(activation_sparsity,
                                              dtype=torch.float32)
        normal_dist = torch.distributions.normal.Normal(0, 1)
        self.std_multiplier = normal_dist.icdf(target_sparsity_tensor)

    def _gaussian_topk(self, x: torch.Tensor) -> torch.Tensor:
        """Get % sparse percentile of the Gaussian distribution."""
        # NOTE(rob): for TP>1, we could all-gather to get the means/std.
        # But we do not do this because in expectation they are the same
        # and in practice the eval scores are good without gathering.
        mean = torch.mean(x, dim=-1, keepdim=True)
        std = torch.std(x, dim=-1, keepdim=True, unbiased=False)
        cutoff_x = mean + std * self.std_multiplier
        return nn.functional.relu(x - cutoff_x)

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        out = self._gaussian_topk(x[..., :d])
        out = F.gelu(out, approximate=self.approximate)
        return out * x[..., d:]

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        return self.forward_native(x)

approximate instance-attribute

approximate = approximate

std_multiplier instance-attribute

std_multiplier = icdf(target_sparsity_tensor)

__init__

__init__(
    activation_sparsity: float, approximate: str = "none"
)
Source code in vllm/model_executor/layers/activation.py
def __init__(self, activation_sparsity: float, approximate: str = "none"):
    super().__init__()
    # Gelu.
    self.approximate = approximate
    if approximate not in ("none", "tanh"):
        raise ValueError(f"Unknown approximate mode: {approximate}")

    # Sparsity.
    if activation_sparsity == 0.0:
        raise ValueError(
            "activation_sparsity is 0.0. Please use GeluAndMul.")
    target_sparsity_tensor = torch.tensor(activation_sparsity,
                                          dtype=torch.float32)
    normal_dist = torch.distributions.normal.Normal(0, 1)
    self.std_multiplier = normal_dist.icdf(target_sparsity_tensor)

_gaussian_topk

_gaussian_topk(x: Tensor) -> Tensor

Get % sparse percentile of the Gaussian distribution.

Source code in vllm/model_executor/layers/activation.py
def _gaussian_topk(self, x: torch.Tensor) -> torch.Tensor:
    """Get % sparse percentile of the Gaussian distribution."""
    # NOTE(rob): for TP>1, we could all-gather to get the means/std.
    # But we do not do this because in expectation they are the same
    # and in practice the eval scores are good without gathering.
    mean = torch.mean(x, dim=-1, keepdim=True)
    std = torch.std(x, dim=-1, keepdim=True, unbiased=False)
    cutoff_x = mean + std * self.std_multiplier
    return nn.functional.relu(x - cutoff_x)

forward_cuda

forward_cuda(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    return self.forward_native(x)

forward_native

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py
def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    d = x.shape[-1] // 2
    out = self._gaussian_topk(x[..., :d])
    out = F.gelu(out, approximate=self.approximate)
    return out * x[..., d:]
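
A rough pure-PyTorch sketch of the sparsity step: std_multiplier is the standard-normal quantile of the target sparsity, so roughly that fraction of each gate row falls below the per-row cutoff and is zeroed before the GELU. The sparsity value here is illustrative only:

import torch
import torch.nn.functional as F

activation_sparsity = 0.95                        # zero out ~95% of gate entries
std_multiplier = torch.distributions.Normal(0, 1).icdf(
    torch.tensor(activation_sparsity, dtype=torch.float32))

x = torch.randn(8, 2 * 256)
d = x.shape[-1] // 2
gate = x[..., :d]
cutoff = (torch.mean(gate, dim=-1, keepdim=True) +
          torch.std(gate, dim=-1, keepdim=True, unbiased=False) * std_multiplier)
gate = F.relu(gate - cutoff)                      # mirrors _gaussian_topk
out = F.gelu(gate) * x[..., d:]
print((gate > 0).float().mean())                  # ~0.05 on Gaussian inputs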

MulAndSilu

Bases: CustomOp

An activation function for SwiGLU.

The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.

Shapes

x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
return: (num_tokens, d) or (batch_size, seq_len, d)

Source code in vllm/model_executor/layers/activation.py
@CustomOp.register("mul_and_silu")
class MulAndSilu(CustomOp):
    """An activation function for SwiGLU.

    The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    def __init__(self):
        super().__init__()
        if current_platform.is_cuda_alike():
            self.op = torch.ops._C.mul_and_silu
        elif current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops
            self.op = ipex_ops.silu_and_mul
        elif current_platform.is_cpu():
            self._forward_method = self.forward_native

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        return x[..., :d] * F.silu(x[..., d:])

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out

_forward_method instance-attribute

_forward_method = forward_native

op instance-attribute

op = mul_and_silu

__init__

__init__()
Source code in vllm/model_executor/layers/activation.py
def __init__(self):
    super().__init__()
    if current_platform.is_cuda_alike():
        self.op = torch.ops._C.mul_and_silu
    elif current_platform.is_xpu():
        from vllm._ipex_ops import ipex_ops
        self.op = ipex_ops.silu_and_mul
    elif current_platform.is_cpu():
        self._forward_method = self.forward_native

forward_cuda

forward_cuda(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    output_shape = (x.shape[:-1] + (d, ))
    out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
    self.op(out, x)
    return out

forward_native

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py
def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    d = x.shape[-1] // 2
    return x[..., :d] * F.silu(x[..., d:])
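
MulAndSilu differs from SiluAndMul only in which half is gated: SiLU is applied to the second half of the last dimension and multiplied by the first. A pure-PyTorch reproduction of forward_native:

import torch
import torch.nn.functional as F

x = torch.randn(8, 2 * 64)                 # (num_tokens, 2 * d)
d = x.shape[-1] // 2
out = x[..., :d] * F.silu(x[..., d:])      # note: SiLU on the second half
print(out.shape)                           # torch.Size([8, 64])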

NewGELU

Bases: CustomOp

Source code in vllm/model_executor/layers/activation.py
@CustomOp.register("gelu_new")
class NewGELU(CustomOp):

    def __init__(self):
        super().__init__()
        if current_platform.is_cuda_alike() or current_platform.is_cpu():
            self.op = torch.ops._C.gelu_new
        elif current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops
            self.op = ipex_ops.gelu_new

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        c = math.sqrt(2.0 / math.pi)
        return 0.5 * x * (1.0 + torch.tanh(c *
                                           (x + 0.044715 * torch.pow(x, 3.0))))

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
        self.op(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        return self.op(x)

op instance-attribute

op = gelu_new

__init__

__init__()
Source code in vllm/model_executor/layers/activation.py
def __init__(self):
    super().__init__()
    if current_platform.is_cuda_alike() or current_platform.is_cpu():
        self.op = torch.ops._C.gelu_new
    elif current_platform.is_xpu():
        from vllm._ipex_ops import ipex_ops
        self.op = ipex_ops.gelu_new

forward_cuda

forward_cuda(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    self.op(out, x)
    return out

forward_native

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py
def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    c = math.sqrt(2.0 / math.pi)
    return 0.5 * x * (1.0 + torch.tanh(c *
                                       (x + 0.044715 * torch.pow(x, 3.0))))

forward_xpu

forward_xpu(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
    return self.op(x)
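
Since c = sqrt(2.0 / math.pi), forward_native is the same tanh approximation that PyTorch exposes as F.gelu(approximate="tanh"). A quick pure-PyTorch check (illustrative only):

import math
import torch
import torch.nn.functional as F

x = torch.randn(1024)
c = math.sqrt(2.0 / math.pi)
new_gelu = 0.5 * x * (1.0 + torch.tanh(c * (x + 0.044715 * torch.pow(x, 3.0))))
print((new_gelu - F.gelu(x, approximate="tanh")).abs().max())   # ~1e-7 in float32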

QuickGELU

Bases: CustomOp

Source code in vllm/model_executor/layers/activation.py
@CustomOp.register("quick_gelu")
class QuickGELU(CustomOp):
    # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90
    def __init__(self):
        super().__init__()
        if current_platform.is_cuda_alike() or current_platform.is_cpu():
            self.op = torch.ops._C.gelu_quick
        elif current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops
            self.op = ipex_ops.gelu_quick

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        return x * torch.sigmoid(1.702 * x)

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
        self.op(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
        self.op(out, x)
        return out

op instance-attribute

op = gelu_quick

__init__

__init__()
Source code in vllm/model_executor/layers/activation.py
def __init__(self):
    super().__init__()
    if current_platform.is_cuda_alike() or current_platform.is_cpu():
        self.op = torch.ops._C.gelu_quick
    elif current_platform.is_xpu():
        from vllm._ipex_ops import ipex_ops
        self.op = ipex_ops.gelu_quick

forward_cuda

forward_cuda(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    self.op(out, x)
    return out

forward_native

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py
def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    return x * torch.sigmoid(1.702 * x)

forward_xpu

forward_xpu(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    self.op(out, x)
    return out
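
QuickGELU approximates GELU with a single sigmoid, x * sigmoid(1.702 * x). A pure-PyTorch sketch comparing it against exact GELU (illustrative only):

import torch
import torch.nn.functional as F

x = torch.randn(1024)
quick = x * torch.sigmoid(1.702 * x)
print((quick - F.gelu(x)).abs().max())   # roughly 2e-2: coarse but cheap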

ReLUSquaredActivation

Bases: CustomOp

Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2

Source code in vllm/model_executor/layers/activation.py
@CustomOp.register("relu2")
class ReLUSquaredActivation(CustomOp):
    """
    Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
    """

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        return torch.square(F.relu(x))

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        return self.forward_native(x)

forward_cuda

forward_cuda(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    return self.forward_native(x)

forward_native

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py
def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    return torch.square(F.relu(x))
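
relu^2 simply squares the ReLU output: negative inputs map to zero and positive inputs grow quadratically. A one-line pure-PyTorch sketch:

import torch
import torch.nn.functional as F

x = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0])
print(torch.square(F.relu(x)))   # tensor([0.0000, 0.0000, 0.0000, 0.2500, 4.0000])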

ScaledActivation

Bases: Module

An activation function with post-scale parameters.

This is used for some quantization methods like AWQ.

Source code in vllm/model_executor/layers/activation.py
class ScaledActivation(nn.Module):
    """An activation function with post-scale parameters.

    This is used for some quantization methods like AWQ.
    """

    def __init__(
        self,
        act_module: nn.Module,
        intermediate_size: int,
        input_is_parallel: bool = True,
        params_dtype: Optional[torch.dtype] = None,
    ):
        super().__init__()
        self.act = act_module
        self.input_is_parallel = input_is_parallel
        if input_is_parallel:
            tp_size = get_tensor_model_parallel_world_size()
            intermediate_size_per_partition = divide(intermediate_size,
                                                     tp_size)
        else:
            intermediate_size_per_partition = intermediate_size
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.scales = nn.Parameter(
            torch.empty(intermediate_size_per_partition, dtype=params_dtype))
        set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.act(x) / self.scales

    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
        param_data = param.data
        if self.input_is_parallel:
            tp_rank = get_tensor_model_parallel_rank()
            shard_size = param_data.shape[0]
            start_idx = tp_rank * shard_size
            loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
        assert param_data.shape == loaded_weight.shape
        param_data.copy_(loaded_weight)

act instance-attribute

act = act_module

input_is_parallel instance-attribute

input_is_parallel = input_is_parallel

scales instance-attribute

scales = Parameter(
    empty(
        intermediate_size_per_partition, dtype=params_dtype
    )
)

__init__

__init__(
    act_module: Module,
    intermediate_size: int,
    input_is_parallel: bool = True,
    params_dtype: Optional[dtype] = None,
)
Source code in vllm/model_executor/layers/activation.py
def __init__(
    self,
    act_module: nn.Module,
    intermediate_size: int,
    input_is_parallel: bool = True,
    params_dtype: Optional[torch.dtype] = None,
):
    super().__init__()
    self.act = act_module
    self.input_is_parallel = input_is_parallel
    if input_is_parallel:
        tp_size = get_tensor_model_parallel_world_size()
        intermediate_size_per_partition = divide(intermediate_size,
                                                 tp_size)
    else:
        intermediate_size_per_partition = intermediate_size
    if params_dtype is None:
        params_dtype = torch.get_default_dtype()
    self.scales = nn.Parameter(
        torch.empty(intermediate_size_per_partition, dtype=params_dtype))
    set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})

forward

forward(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    return self.act(x) / self.scales

weight_loader

weight_loader(param: Parameter, loaded_weight: Tensor)
Source code in vllm/model_executor/layers/activation.py
def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
    param_data = param.data
    if self.input_is_parallel:
        tp_rank = get_tensor_model_parallel_rank()
        shard_size = param_data.shape[0]
        start_idx = tp_rank * shard_size
        loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
    assert param_data.shape == loaded_weight.shape
    param_data.copy_(loaded_weight)
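
The forward pass divides the activation output by a learned per-channel scale. A stripped-down, single-device sketch without the tensor-parallel sharding or weight loading (the class name here is hypothetical, not vLLM API):

import torch
import torch.nn as nn

class TinyScaledActivation(nn.Module):
    """Illustrative only: act(x) / scales, no TP sharding or weight loader."""

    def __init__(self, act: nn.Module, intermediate_size: int):
        super().__init__()
        self.act = act
        self.scales = nn.Parameter(torch.ones(intermediate_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.act(x) / self.scales

layer = TinyScaledActivation(nn.GELU(), intermediate_size=64)
print(layer(torch.randn(8, 64)).shape)   # torch.Size([8, 64])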

SiluAndMul

Bases: CustomOp

An activation function for SwiGLU.

The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.

Shapes

x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
return: (num_tokens, d) or (batch_size, seq_len, d)

Source code in vllm/model_executor/layers/activation.py
@CustomOp.register("silu_and_mul")
class SiluAndMul(CustomOp):
    """An activation function for SwiGLU.

    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    def __init__(self):
        super().__init__()
        if current_platform.is_cuda_alike() or current_platform.is_cpu():
            self.op = torch.ops._C.silu_and_mul
        elif current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops
            self.op = ipex_ops.silu_and_mul

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        return F.silu(x[..., :d]) * x[..., d:]

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out

    def forward_neuron(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        x_reshaped = x.view(-1, x.shape[-1])
        s = x_reshaped[:, :d] * F.sigmoid(x_reshaped[:, :d])
        result = s * x_reshaped[:, d:]
        return result.view(*x.shape[:-1], d)

op instance-attribute

op = silu_and_mul

__init__

__init__()
Source code in vllm/model_executor/layers/activation.py
def __init__(self):
    super().__init__()
    if current_platform.is_cuda_alike() or current_platform.is_cpu():
        self.op = torch.ops._C.silu_and_mul
    elif current_platform.is_xpu():
        from vllm._ipex_ops import ipex_ops
        self.op = ipex_ops.silu_and_mul

forward_cuda

forward_cuda(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    output_shape = (x.shape[:-1] + (d, ))
    out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
    self.op(out, x)
    return out

forward_native

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py
def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

forward_neuron

forward_neuron(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_neuron(self, x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    x_reshaped = x.view(-1, x.shape[-1])
    s = x_reshaped[:, :d] * F.sigmoid(x_reshaped[:, :d])
    result = s * x_reshaped[:, d:]
    return result.view(*x.shape[:-1], d)

forward_xpu

forward_xpu(x: Tensor) -> Tensor
Source code in vllm/model_executor/layers/activation.py
def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    output_shape = (x.shape[:-1] + (d, ))
    out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
    self.op(out, x)
    return out
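
This is the standard SwiGLU gate: SiLU on the first half of the last dimension, multiplied element-wise by the second half. A pure-PyTorch reproduction of forward_native with a 3-D input:

import torch
import torch.nn.functional as F

x = torch.randn(2, 16, 2 * 128)            # (batch_size, seq_len, 2 * d)
d = x.shape[-1] // 2
out = F.silu(x[..., :d]) * x[..., d:]
print(out.shape)                           # torch.Size([2, 16, 128])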

get_act_and_mul_fn

get_act_and_mul_fn(act_fn_name: str) -> Module

Get an activation-and-mul (i.e. SiluAndMul) function by name.

Source code in vllm/model_executor/layers/activation.py
def get_act_and_mul_fn(act_fn_name: str) -> nn.Module:
    """Get an activation-and-mul (i.e. SiluAndMul) function by name."""
    act_fn_name = act_fn_name.lower()
    if act_fn_name not in _ACTIVATION_AND_MUL_REGISTRY:
        raise ValueError(
            f"Activation function {act_fn_name!r} is not supported.")

    return _ACTIVATION_AND_MUL_REGISTRY[act_fn_name]

get_act_fn

get_act_fn(act_fn_name: str) -> Module

Get an activation function by name.

Source code in vllm/model_executor/layers/activation.py
def get_act_fn(act_fn_name: str) -> nn.Module:
    """Get an activation function by name."""
    act_fn_name = act_fn_name.lower()
    if act_fn_name not in _ACTIVATION_REGISTRY:
        raise ValueError(
            f"Activation function {act_fn_name!r} is not supported.")

    return _ACTIVATION_REGISTRY[act_fn_name]
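
Both getters lower-case the name before the lookup and raise ValueError for names that are not in the corresponding registry. A small usage sketch, assuming vLLM is installed:

from vllm.model_executor.layers.activation import get_act_fn

act = get_act_fn("GELU_NEW")        # case-insensitive lookup -> NewGELU()
try:
    get_act_fn("swish")             # not in _ACTIVATION_REGISTRY
except ValueError as err:
    print(err)                      # Activation function 'swish' is not supported.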