vllm.model_executor.models.glm4v

Inference-only GLM-4V model compatible with THUDM weights.

EVA2CLIPAttention

Bases: Module

Source code in vllm/model_executor/models/glm4v.py
class EVA2CLIPAttention(nn.Module):

    def __init__(
        self,
        config,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = '',
    ):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.tp_size = get_tensor_model_parallel_world_size()
        self.num_heads_per_rank = config.num_heads // self.tp_size
        self.head_dim = config.hidden_size // config.num_heads
        self.scale = self.head_dim**-0.5

        self.query_key_value = QKVParallelLinear(
            config.hidden_size,
            self.head_dim,
            config.num_heads,
            quant_config=quant_config,
            prefix=f"{prefix}.query_key_value",
        )
        self.dense = RowParallelLinear(
            config.hidden_size,
            config.hidden_size,
            quant_config=quant_config,
            prefix=f"{prefix}.dense",
        )

        self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim,
                                       self.scale)
        self.output_dropout = torch.nn.Dropout(config.dropout_prob)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        qkv, _ = self.query_key_value(x)  # B, L, 3 * H * D
        q, k, v = qkv.chunk(3, dim=-1)

        out = self.attn(q, k, v)
        output, _ = self.dense(out)
        output = self.output_dropout(output)
        return output

attn instance-attribute

attn = MultiHeadAttention(
    num_heads_per_rank, head_dim, scale
)

dense instance-attribute

dense = RowParallelLinear(
    hidden_size,
    hidden_size,
    quant_config=quant_config,
    prefix=f"{prefix}.dense",
)

head_dim instance-attribute

head_dim = hidden_size // num_heads

hidden_size instance-attribute

hidden_size = hidden_size

num_heads_per_rank instance-attribute

num_heads_per_rank = num_heads // tp_size

output_dropout instance-attribute

output_dropout = Dropout(dropout_prob)

query_key_value instance-attribute

query_key_value = QKVParallelLinear(
    hidden_size,
    head_dim,
    num_heads,
    quant_config=quant_config,
    prefix=f"{prefix}.query_key_value",
)

scale instance-attribute

scale = head_dim ** -0.5

tp_size instance-attribute

tp_size = get_tensor_model_parallel_world_size()

__init__

__init__(
    config,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/glm4v.py
def __init__(
    self,
    config,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = '',
):
    super().__init__()
    self.hidden_size = config.hidden_size
    self.tp_size = get_tensor_model_parallel_world_size()
    self.num_heads_per_rank = config.num_heads // self.tp_size
    self.head_dim = config.hidden_size // config.num_heads
    self.scale = self.head_dim**-0.5

    self.query_key_value = QKVParallelLinear(
        config.hidden_size,
        self.head_dim,
        config.num_heads,
        quant_config=quant_config,
        prefix=f"{prefix}.query_key_value",
    )
    self.dense = RowParallelLinear(
        config.hidden_size,
        config.hidden_size,
        quant_config=quant_config,
        prefix=f"{prefix}.dense",
    )

    self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim,
                                   self.scale)
    self.output_dropout = torch.nn.Dropout(config.dropout_prob)

forward

forward(x: Tensor) -> Tensor
Source code in vllm/model_executor/models/glm4v.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    qkv, _ = self.query_key_value(x)  # B, L, 3 * H * D
    q, k, v = qkv.chunk(3, dim=-1)

    out = self.attn(q, k, v)
    output, _ = self.dense(out)
    output = self.output_dropout(output)
    return output
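
For intuition, here is a plain-PyTorch sketch of the attention math that `MultiHeadAttention` performs on the chunked `q`, `k`, `v` (single rank, no tensor parallelism; the sizes are made up for illustration):

```python
import torch
import torch.nn.functional as F

b, seq, num_heads, head_dim = 2, 5, 4, 8
hidden_size = num_heads * head_dim

qkv = torch.randn(b, seq, 3 * hidden_size)   # stands in for the query_key_value output
q, k, v = qkv.chunk(3, dim=-1)               # each (B, L, H * D)

def split_heads(t: torch.Tensor) -> torch.Tensor:
    return t.view(b, seq, num_heads, head_dim).transpose(1, 2)  # (B, H, L, D)

# scaled_dot_product_attention defaults to a 1/sqrt(head_dim) scale,
# matching self.scale = head_dim ** -0.5 above.
out = F.scaled_dot_product_attention(split_heads(q), split_heads(k), split_heads(v))
out = out.transpose(1, 2).reshape(b, seq, hidden_size)          # back to (B, L, H * D)
print(out.shape)  # torch.Size([2, 5, 32])
```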

EVA2CLIPGLU

Bases: Module

Source code in vllm/model_executor/models/glm4v.py
class EVA2CLIPGLU(nn.Module):

    def __init__(
        self,
        config,
        in_features,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = '',
    ):
        """
        The original implementation is the same as:
        ```python
        self.dense_h_to_4h = ColumnParallelLinear(
            config.hidden_size,
            config.ffn_hidden_size,
            bias=False,
            quant_config=quant_config
        )

        self.gate_proj = ColumnParallelLinear(
            config.hidden_size,
            config.ffn_hidden_size,
            bias=False,
            quant_config=quant_config
        )
        ```
        ```
        gate_proj_output, _ = self.gate_proj(x)
        dense_h_to_4h_output, _ = self.dense_h_to_4h(x)
        x = torch.cat([gate_proj_output, dense_h_to_4h_output], dim=-1)
        ```

        We merge two ColumnParallelLinear into one MergedColumnParallelLinear:
        ```
        self.merged_proj = MergedColumnParallelLinear(
            config.hidden_size,
            [config.ffn_hidden_size] * 2,
            bias=False,
            quant_config=quant_config
        )
        ```
        ```
        x, _ = self.merged_proj(x)
        ```
        """
        super().__init__()
        self.linear_proj = ReplicatedLinear(in_features,
                                            config.hidden_size,
                                            bias=False,
                                            quant_config=quant_config,
                                            prefix=f"{prefix}.linear_proj")
        self.norm1 = nn.LayerNorm(config.hidden_size)
        self.act1 = nn.GELU()
        self.act2 = SiluAndMul()

        self.merged_proj = MergedColumnParallelLinear(
            config.hidden_size, [config.ffn_hidden_size] * 2,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.merged_proj")

        self.dense_4h_to_h = RowParallelLinear(
            config.ffn_hidden_size,
            config.hidden_size,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.dense_4h_to_h")

    def forward(self, x):
        x, _ = self.linear_proj(x)
        x = self.act1(self.norm1(x))
        x, _ = self.merged_proj(x)
        x = self.act2(x)
        x, _ = self.dense_4h_to_h(x)
        return x

act1 instance-attribute

act1 = GELU()

act2 instance-attribute

act2 = SiluAndMul()

dense_4h_to_h instance-attribute

dense_4h_to_h = RowParallelLinear(
    ffn_hidden_size,
    hidden_size,
    bias=False,
    quant_config=quant_config,
    prefix=f"{prefix}.dense_4h_to_h",
)

linear_proj instance-attribute

linear_proj = ReplicatedLinear(
    in_features,
    hidden_size,
    bias=False,
    quant_config=quant_config,
    prefix=f"{prefix}.linear_proj",
)

merged_proj instance-attribute

merged_proj = MergedColumnParallelLinear(
    hidden_size,
    [ffn_hidden_size] * 2,
    bias=False,
    quant_config=quant_config,
    prefix=f"{prefix}.merged_proj",
)

norm1 instance-attribute

norm1 = LayerNorm(hidden_size)

__init__

__init__(
    config,
    in_features,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
)

The original implementation is the same as:

self.dense_h_to_4h = ColumnParallelLinear(
    config.hidden_size,
    config.ffn_hidden_size,
    bias=False,
    quant_config=quant_config
)

self.gate_proj = ColumnParallelLinear(
    config.hidden_size,
    config.ffn_hidden_size,
    bias=False,
    quant_config=quant_config
)

gate_proj_output, _ = self.gate_proj(x)
dense_h_to_4h_output, _ = self.dense_h_to_4h(x)
x = torch.cat([gate_proj_output, dense_h_to_4h_output], dim=-1)

We merge two ColumnParallelLinear into one MergedColumnParallelLinear:

self.merged_proj = MergedColumnParallelLinear(
    config.hidden_size,
    [config.ffn_hidden_size] * 2,
    bias=False,
    quant_config=quant_config
)

x, _ = self.merged_proj(x)

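A quick sanity check of the merge described above (plain PyTorch, no tensor parallelism, illustrative sizes): concatenating the two weight matrices along the output dimension and applying a single projection reproduces the concatenated outputs of the two separate projections.

```python
import torch

hidden_size, ffn_hidden_size = 8, 16  # illustrative sizes only
x = torch.randn(2, 5, hidden_size)

gate_w = torch.randn(ffn_hidden_size, hidden_size)  # gate_proj weight
up_w = torch.randn(ffn_hidden_size, hidden_size)    # dense_h_to_4h weight

# Two separate projections followed by concatenation ...
separate = torch.cat([x @ gate_w.T, x @ up_w.T], dim=-1)

# ... equal one projection with the stacked ("merged") weight.
merged_w = torch.cat([gate_w, up_w], dim=0)
merged = x @ merged_w.T

assert torch.allclose(separate, merged, atol=1e-5)
```
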
Source code in vllm/model_executor/models/glm4v.py
def __init__(
    self,
    config,
    in_features,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = '',
):
    """
    The original implementation is the same as:
    ```python
    self.dense_h_to_4h = ColumnParallelLinear(
        config.hidden_size,
        config.ffn_hidden_size,
        bias=False,
        quant_config=quant_config
    )

    self.gate_proj = ColumnParallelLinear(
        config.hidden_size,
        config.ffn_hidden_size,
        bias=False,
        quant_config=quant_config
    )
    ```
    ```
    gate_proj_output, _ = self.gate_proj(x)
    dense_h_to_4h_output, _ = self.dense_h_to_4h(x)
    x = torch.cat([gate_proj_output, dense_h_to_4h_output], dim=-1)
    ```

    We merge two ColumnParallelLinear into one MergedColumnParallelLinear:
    ```
    self.merged_proj = MergedColumnParallelLinear(
        config.hidden_size,
        [config.ffn_hidden_size] * 2,
        bias=False,
        quant_config=quant_config
    )
    ```
    ```
    x, _ = self.merged_proj(x)
    ```
    """
    super().__init__()
    self.linear_proj = ReplicatedLinear(in_features,
                                        config.hidden_size,
                                        bias=False,
                                        quant_config=quant_config,
                                        prefix=f"{prefix}.linear_proj")
    self.norm1 = nn.LayerNorm(config.hidden_size)
    self.act1 = nn.GELU()
    self.act2 = SiluAndMul()

    self.merged_proj = MergedColumnParallelLinear(
        config.hidden_size, [config.ffn_hidden_size] * 2,
        bias=False,
        quant_config=quant_config,
        prefix=f"{prefix}.merged_proj")

    self.dense_4h_to_h = RowParallelLinear(
        config.ffn_hidden_size,
        config.hidden_size,
        bias=False,
        quant_config=quant_config,
        prefix=f"{prefix}.dense_4h_to_h")

forward

forward(x)
Source code in vllm/model_executor/models/glm4v.py
def forward(self, x):
    x, _ = self.linear_proj(x)
    x = self.act1(self.norm1(x))
    x, _ = self.merged_proj(x)
    x = self.act2(x)
    x, _ = self.dense_4h_to_h(x)
    return x

EVA2CLIPMLP

Bases: Module

Source code in vllm/model_executor/models/glm4v.py
class EVA2CLIPMLP(nn.Module):

    def __init__(
        self,
        config,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = '',
    ):
        super().__init__()
        self.config = config
        self.activation_fn = get_act_fn(config.hidden_act)
        self.fc1 = ColumnParallelLinear(
            config.hidden_size,
            config.intermediate_size,
            quant_config=quant_config,
            prefix=f"{prefix}.fc1",
        )
        self.fc2 = RowParallelLinear(
            config.intermediate_size,
            config.hidden_size,
            quant_config=quant_config,
            prefix=f"{prefix}.fc2",
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x, _ = self.fc1(x)
        x = self.activation_fn(x)
        x, _ = self.fc2(x)
        return x

activation_fn instance-attribute

activation_fn = get_act_fn(hidden_act)

config instance-attribute

config = config

fc1 instance-attribute

fc1 = ColumnParallelLinear(
    hidden_size,
    intermediate_size,
    quant_config=quant_config,
    prefix=f"{prefix}.fc1",
)

fc2 instance-attribute

fc2 = RowParallelLinear(
    intermediate_size,
    hidden_size,
    quant_config=quant_config,
    prefix=f"{prefix}.fc2",
)

__init__

__init__(
    config,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/glm4v.py
def __init__(
    self,
    config,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = '',
):
    super().__init__()
    self.config = config
    self.activation_fn = get_act_fn(config.hidden_act)
    self.fc1 = ColumnParallelLinear(
        config.hidden_size,
        config.intermediate_size,
        quant_config=quant_config,
        prefix=f"{prefix}.fc1",
    )
    self.fc2 = RowParallelLinear(
        config.intermediate_size,
        config.hidden_size,
        quant_config=quant_config,
        prefix=f"{prefix}.fc2",
    )

forward

forward(x: Tensor) -> Tensor
Source code in vllm/model_executor/models/glm4v.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    x, _ = self.fc1(x)
    x = self.activation_fn(x)
    x, _ = self.fc2(x)
    return x

EVA2CLIPModel

Bases: Module

Source code in vllm/model_executor/models/glm4v.py
class EVA2CLIPModel(nn.Module):

    def __init__(
        self,
        config,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = '',
    ):
        super().__init__()
        vision_config = Namespace(**config.vision_config)
        self.patch_embedding = EVA2CLIPPatchEmbedding(vision_config)
        self.transformer = EVA2CLIPTransformer(vision_config,
                                               quant_config=quant_config,
                                               prefix=f"{prefix}.transformer")
        self.linear_proj = EVA2CLIPGLU(config,
                                       in_features=config.hidden_size,
                                       quant_config=quant_config,
                                       prefix=f"{prefix}.linear_proj")
        self.conv = nn.Conv2d(in_channels=vision_config.hidden_size,
                              out_channels=config.hidden_size,
                              kernel_size=2,
                              stride=2)
        self.boi = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.eoi = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.scaling_factor = vision_config.scaling_factor

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        """
        Parameters:
        images : torch.Tensor
            Input image tensor with shape (B, C, H, W)

        Returns:
        torch.Tensor
            Transformed tensor with shape (B, L, D)
        """
        x = self.patch_embedding(images)
        x = self.transformer(x)
        x = x[:, 1:]

        b, s, h = x.shape
        grid_size = int(s**0.5)
        x = x.view(b, grid_size, grid_size, h).permute(0, 3, 1, 2)
        x = self.conv(x)

        x = x.flatten(2).transpose(1, 2)
        x = self.linear_proj(x)
        boi = self.boi.expand(x.shape[0], -1, -1)
        eoi = self.eoi.expand(x.shape[0], -1, -1)
        x = torch.cat((boi, x, eoi), dim=1)
        x = x / self.scaling_factor
        return x

boi instance-attribute

boi = Parameter(zeros(1, 1, hidden_size))

conv instance-attribute

conv = Conv2d(
    in_channels=hidden_size,
    out_channels=hidden_size,
    kernel_size=2,
    stride=2,
)

eoi instance-attribute

eoi = Parameter(zeros(1, 1, hidden_size))

linear_proj instance-attribute

linear_proj = EVA2CLIPGLU(
    config,
    in_features=hidden_size,
    quant_config=quant_config,
    prefix=f"{prefix}.linear_proj",
)

patch_embedding instance-attribute

patch_embedding = EVA2CLIPPatchEmbedding(vision_config)

scaling_factor instance-attribute

scaling_factor = scaling_factor

transformer instance-attribute

transformer = EVA2CLIPTransformer(
    vision_config,
    quant_config=quant_config,
    prefix=f"{prefix}.transformer",
)

__init__

__init__(
    config,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/glm4v.py
def __init__(
    self,
    config,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = '',
):
    super().__init__()
    vision_config = Namespace(**config.vision_config)
    self.patch_embedding = EVA2CLIPPatchEmbedding(vision_config)
    self.transformer = EVA2CLIPTransformer(vision_config,
                                           quant_config=quant_config,
                                           prefix=f"{prefix}.transformer")
    self.linear_proj = EVA2CLIPGLU(config,
                                   in_features=config.hidden_size,
                                   quant_config=quant_config,
                                   prefix=f"{prefix}.linear_proj")
    self.conv = nn.Conv2d(in_channels=vision_config.hidden_size,
                          out_channels=config.hidden_size,
                          kernel_size=2,
                          stride=2)
    self.boi = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
    self.eoi = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
    self.scaling_factor = vision_config.scaling_factor

forward

forward(images: Tensor) -> Tensor

Parameters:

images : torch.Tensor
    Input image tensor with shape (B, C, H, W)

Returns:

torch.Tensor
    Transformed tensor with shape (B, L, D)

Source code in vllm/model_executor/models/glm4v.py
def forward(self, images: torch.Tensor) -> torch.Tensor:
    """
    Parameters:
    images : torch.Tensor
        Input image tensor with shape (B, C, H, W)

    Returns:
    torch.Tensor
        Transformed tensor with shape (B, L, D)
    """
    x = self.patch_embedding(images)
    x = self.transformer(x)
    x = x[:, 1:]

    b, s, h = x.shape
    grid_size = int(s**0.5)
    x = x.view(b, grid_size, grid_size, h).permute(0, 3, 1, 2)
    x = self.conv(x)

    x = x.flatten(2).transpose(1, 2)
    x = self.linear_proj(x)
    boi = self.boi.expand(x.shape[0], -1, -1)
    eoi = self.eoi.expand(x.shape[0], -1, -1)
    x = torch.cat((boi, x, eoi), dim=1)
    x = x / self.scaling_factor
    return x
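
A minimal trace of the shape bookkeeping in this forward pass, using made-up sizes (batch 2, 4x4 patch grid, vision hidden size 8, language hidden size 6) rather than the real vision config:

```python
import torch
import torch.nn as nn

b, grid, vision_h, out_h = 2, 4, 8, 6
x = torch.randn(b, grid * grid + 1, vision_h)  # transformer output incl. cls token

x = x[:, 1:]                                                # drop cls token -> (B, S, H)
x = x.view(b, grid, grid, vision_h).permute(0, 3, 1, 2)     # (B, H, grid, grid)
x = nn.Conv2d(vision_h, out_h, kernel_size=2, stride=2)(x)  # conv halves the grid
x = x.flatten(2).transpose(1, 2)                            # (B, (grid // 2) ** 2, out_h)
print(x.shape)  # torch.Size([2, 4, 6])
```

After the GLU projection, the learned boi and eoi embeddings are concatenated on either side, so the final sequence length is `(grid // 2) ** 2 + 2`.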

EVA2CLIPPatchEmbedding

Bases: Module

Source code in vllm/model_executor/models/glm4v.py
class EVA2CLIPPatchEmbedding(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.proj = nn.Conv2d(config.in_channels,
                              config.hidden_size,
                              kernel_size=config.patch_size,
                              stride=config.patch_size)
        self.cls_embedding = nn.Parameter(torch.zeros(1, config.hidden_size))
        self.position_embedding = nn.Embedding(config.num_positions,
                                               config.hidden_size)

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        """
        Parameters:
        images : torch.Tensor
            Input image tensor with shape (B, C, H, W)

        Returns:
        torch.Tensor
            Transformed tensor with shape (B, L, D)
        """
        images = images.to(device=self.proj.weight.device,
                           dtype=self.proj.weight.dtype)
        x = self.proj(images)
        x = x.flatten(2).transpose(1, 2)
        cls_token = self.cls_embedding.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_token, x), dim=1)
        x += self.position_embedding.weight.unsqueeze(0)
        return x

cls_embedding instance-attribute

cls_embedding = Parameter(zeros(1, hidden_size))

position_embedding instance-attribute

position_embedding = Embedding(num_positions, hidden_size)

proj instance-attribute

proj = Conv2d(
    in_channels,
    hidden_size,
    kernel_size=patch_size,
    stride=patch_size,
)

__init__

__init__(config)
Source code in vllm/model_executor/models/glm4v.py
def __init__(self, config):
    super().__init__()
    self.proj = nn.Conv2d(config.in_channels,
                          config.hidden_size,
                          kernel_size=config.patch_size,
                          stride=config.patch_size)
    self.cls_embedding = nn.Parameter(torch.zeros(1, config.hidden_size))
    self.position_embedding = nn.Embedding(config.num_positions,
                                           config.hidden_size)

forward

forward(images: Tensor) -> Tensor

Parameters:

images : torch.Tensor
    Input image tensor with shape (B, C, H, W)

Returns:

torch.Tensor
    Transformed tensor with shape (B, L, D)

Source code in vllm/model_executor/models/glm4v.py
def forward(self, images: torch.Tensor) -> torch.Tensor:
    """
    Parameters:
    images : torch.Tensor
        Input image tensor with shape (B, C, H, W)

    Returns:
    torch.Tensor
        Transformed tensor with shape (B, L, D)
    """
    images = images.to(device=self.proj.weight.device,
                       dtype=self.proj.weight.dtype)
    x = self.proj(images)
    x = x.flatten(2).transpose(1, 2)
    cls_token = self.cls_embedding.expand(x.shape[0], -1, -1)
    x = torch.cat((cls_token, x), dim=1)
    x += self.position_embedding.weight.unsqueeze(0)
    return x
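
The same shape logic in isolation, with toy sizes (8x8 image, 4x4 patches) rather than the real config:

```python
import torch
import torch.nn as nn

in_channels, hidden, patch, image = 3, 16, 4, 8
num_positions = (image // patch) ** 2 + 1              # patches + cls token

proj = nn.Conv2d(in_channels, hidden, kernel_size=patch, stride=patch)
cls_embedding = nn.Parameter(torch.zeros(1, hidden))
position_embedding = nn.Embedding(num_positions, hidden)

images = torch.randn(2, in_channels, image, image)
x = proj(images).flatten(2).transpose(1, 2)            # (B, num_patches, hidden)
x = torch.cat((cls_embedding.expand(x.shape[0], -1, -1), x), dim=1)
x = x + position_embedding.weight.unsqueeze(0)         # (B, num_patches + 1, hidden)
print(x.shape)  # torch.Size([2, 5, 16])
```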

EVA2CLIPTransformer

Bases: Module

Source code in vllm/model_executor/models/glm4v.py
class EVA2CLIPTransformer(nn.Module):

    def __init__(
        self,
        config,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = '',
    ):
        super().__init__()
        self.layers = nn.ModuleList([
            EVA2CLIPTransformerLayer(config,
                                     quant_config=quant_config,
                                     prefix=f"{prefix}.layers.{layer_idx}")
            for layer_idx in range(config.num_hidden_layers)
        ])

    def forward(self, hidden_states):
        for layer_module in self.layers:
            hidden_states = layer_module(hidden_states)
        return hidden_states

layers instance-attribute

layers = ModuleList(
    [
        EVA2CLIPTransformerLayer(
            config,
            quant_config=quant_config,
            prefix=f"{prefix}.layers.{layer_idx}",
        )
        for layer_idx in range(num_hidden_layers)
    ]
)

__init__

__init__(
    config,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/glm4v.py
def __init__(
    self,
    config,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = '',
):
    super().__init__()
    self.layers = nn.ModuleList([
        EVA2CLIPTransformerLayer(config,
                                 quant_config=quant_config,
                                 prefix=f"{prefix}.layers.{layer_idx}")
        for layer_idx in range(config.num_hidden_layers)
    ])

forward

forward(hidden_states)
Source code in vllm/model_executor/models/glm4v.py
def forward(self, hidden_states):
    for layer_module in self.layers:
        hidden_states = layer_module(hidden_states)
    return hidden_states

EVA2CLIPTransformerLayer

Bases: Module

Source code in vllm/model_executor/models/glm4v.py
class EVA2CLIPTransformerLayer(nn.Module):

    def __init__(
        self,
        config,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = '',
    ):
        super().__init__()
        self.input_layernorm = LayerNorm(config.hidden_size,
                                         eps=config.layer_norm_eps)
        self.attention = EVA2CLIPAttention(config,
                                           quant_config=quant_config,
                                           prefix=f"{prefix}.attention")
        self.mlp = EVA2CLIPMLP(config,
                               quant_config=quant_config,
                               prefix=f"{prefix}.mlp")
        self.post_attention_layernorm = LayerNorm(config.hidden_size,
                                                  eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        attention_input = hidden_states
        attention_output = self.input_layernorm(
            self.attention(attention_input))
        hidden_states = attention_input + attention_output
        mlp_input = hidden_states
        mlp_output = self.post_attention_layernorm(self.mlp(mlp_input))
        output = mlp_input + mlp_output
        return output

attention instance-attribute

attention = EVA2CLIPAttention(
    config,
    quant_config=quant_config,
    prefix=f"{prefix}.attention",
)

input_layernorm instance-attribute

input_layernorm = LayerNorm(hidden_size, eps=layer_norm_eps)

mlp instance-attribute

mlp = EVA2CLIPMLP(
    config,
    quant_config=quant_config,
    prefix=f"{prefix}.mlp",
)

post_attention_layernorm instance-attribute

post_attention_layernorm = LayerNorm(
    hidden_size, eps=layer_norm_eps
)

__init__

__init__(
    config,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/glm4v.py
def __init__(
    self,
    config,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = '',
):
    super().__init__()
    self.input_layernorm = LayerNorm(config.hidden_size,
                                     eps=config.layer_norm_eps)
    self.attention = EVA2CLIPAttention(config,
                                       quant_config=quant_config,
                                       prefix=f"{prefix}.attention")
    self.mlp = EVA2CLIPMLP(config,
                           quant_config=quant_config,
                           prefix=f"{prefix}.mlp")
    self.post_attention_layernorm = LayerNorm(config.hidden_size,
                                              eps=config.layer_norm_eps)

forward

forward(hidden_states)
Source code in vllm/model_executor/models/glm4v.py
def forward(self, hidden_states):
    attention_input = hidden_states
    attention_output = self.input_layernorm(
        self.attention(attention_input))
    hidden_states = attention_input + attention_output
    mlp_input = hidden_states
    mlp_output = self.post_attention_layernorm(self.mlp(mlp_input))
    output = mlp_input + mlp_output
    return output
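
Note the residual pattern: the LayerNorm is applied to each sublayer's output before the residual add (out = x + LN(attn(x)), then out = out + LN(mlp(out))). A toy illustration with linear layers standing in for the real attention and MLP:

```python
import torch
import torch.nn as nn

hidden = 8
ln1, ln2 = nn.LayerNorm(hidden), nn.LayerNorm(hidden)
attn = nn.Linear(hidden, hidden)  # stand-in for EVA2CLIPAttention
mlp = nn.Linear(hidden, hidden)   # stand-in for EVA2CLIPMLP

x = torch.randn(2, 5, hidden)
x = x + ln1(attn(x))              # input_layernorm applied to the attention output
x = x + ln2(mlp(x))               # post_attention_layernorm applied to the MLP output
print(x.shape)  # torch.Size([2, 5, 8])
```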

GLM4VDummyInputsBuilder

Bases: BaseDummyInputsBuilder[GLM4VProcessingInfo]

Source code in vllm/model_executor/models/glm4v.py
class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_images = mm_counts.get("image", 0)

        base_text = "<|begin_of_image|><|endoftext|><|end_of_image|>"

        return base_text * num_images

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        hf_config = self.info.get_hf_config()
        vision_config = hf_config.vision_config

        target_width = target_height = vision_config["image_size"]
        num_images = mm_counts.get("image", 0)

        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }

get_dummy_mm_data

get_dummy_mm_data(
    seq_len: int, mm_counts: Mapping[str, int]
) -> MultiModalDataDict
Source code in vllm/model_executor/models/glm4v.py
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
    hf_config = self.info.get_hf_config()
    vision_config = hf_config.vision_config

    target_width = target_height = vision_config["image_size"]
    num_images = mm_counts.get("image", 0)

    return {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images)
    }

get_dummy_text

get_dummy_text(mm_counts: Mapping[str, int]) -> str
Source code in vllm/model_executor/models/glm4v.py
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    num_images = mm_counts.get("image", 0)

    base_text = "<|begin_of_image|><|endoftext|><|end_of_image|>"

    return base_text * num_images

GLM4VForCausalLM

Bases: ChatGLMBaseModel, SupportsLoRA, SupportsPP, SupportsMultiModal

Source code in vllm/model_executor/models/glm4v.py
@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor,
                                        info=GLM4VProcessingInfo,
                                        dummy_inputs=GLM4VDummyInputsBuilder)
class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
                       SupportsMultiModal):

    packed_modules_mapping = {
        "query_key_value": ["query_key_value"],
        "dense_h_to_4h": ["dense_h_to_4h"],
        "merged_proj": ["gate_proj", "dense_h_to_4h"]
    }

    def get_mm_mapping(self) -> MultiModelKeys:
        """
        Get the module prefix in multimodal models
        """
        return MultiModelKeys.from_string_field(
            language_model="transformer.encoder",
            connector="transformer.vision.linear_proj",
            tower_model="transformer.vision.transformer")

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
        if modality.startswith("image"):
            return "<|begin_of_image|><|endoftext|><|end_of_image|>"

        raise ValueError("Only image modality is supported")

    def __init__(
        self,
        *,
        vllm_config: VllmConfig,
        prefix: str = "",
        transformer_type: type[GLM4VModel] = GLM4VModel,
    ) -> None:
        super().__init__(
            vllm_config=vllm_config,
            prefix=prefix,
            transformer_type=transformer_type,
        )

        self.transformer: GLM4VModel

    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
        h = w = self.config.vision_config["image_size"]
        expected_dims = (3, h, w)
        actual_dims = tuple(data.shape[1:])

        if actual_dims != expected_dims:
            expected_expr = ("batch_size", *map(str, expected_dims))
            raise ValueError(
                f"The expected shape of pixel values is {expected_expr}. "
                f"You supplied {tuple(data.shape)}.")

        return data

    def _parse_and_validate_image_input(
            self, **kwargs: object) -> Optional[GLMVImagePixelInputs]:
        pixel_values = kwargs.pop("pixel_values", None)

        if pixel_values is not None:
            if not isinstance(pixel_values, (torch.Tensor, list)):
                raise ValueError("Incorrect type of pixel values. "
                                 f"Got type: {type(pixel_values)}")

            return GLMVImagePixelInputs(
                type="pixel_values",
                data=self._validate_pixel_values(
                    flatten_bn(pixel_values, concat=True)),
            )

        return None

    def _process_image_input(
            self, image_input: GLMVImagePixelInputs) -> torch.Tensor:
        pixel_values = image_input["data"].to(dtype=self.config.torch_dtype)

        return self.transformer.vision(pixel_values)

    def get_language_model(self) -> torch.nn.Module:
        return self.transformer

    def get_multimodal_embeddings(self,
                                  **kwargs: object) -> MultiModalEmbeddings:
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
            return []

        vision_embeddings = self._process_image_input(image_input)
        return vision_embeddings

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
    ) -> torch.Tensor:
        inputs_embeds = self.transformer.get_input_embeddings(input_ids)

        if multimodal_embeddings is not None \
            and len(multimodal_embeddings) != 0:
            inputs_embeds = merge_multimodal_embeddings(
                input_ids=input_ids,
                inputs_embeds=inputs_embeds,
                multimodal_embeddings=multimodal_embeddings,
                placeholder_token_id=[
                    self.config.boi_token_id,
                    self.config.pad_token_id,
                    self.config.eoi_token_id,
                ],
            )

        return inputs_embeds

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        if intermediate_tensors is not None:
            inputs_embeds = None

        # NOTE: In v1, inputs_embeds is always generated at model runner, this
        # condition is for v0 compatibility.
        elif inputs_embeds is None:
            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
            inputs_embeds = self.get_input_embeddings(input_ids,
                                                      vision_embeddings)
            input_ids = None

        hidden_states = self.transformer(input_ids, positions,
                                         intermediate_tensors, inputs_embeds)

        return hidden_states
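
As a hedged usage sketch (not part of this file): the model can be driven through vLLM's offline `LLM` API roughly as below. The checkpoint name, image path, and raw-prompt format are assumptions; consult the vLLM multimodal inference docs and the model's chat template for the exact prompt.

```python
from PIL import Image
from vllm import LLM, SamplingParams

# "THUDM/glm-4v-9b" and "example.jpg" are placeholders for illustration.
llm = LLM(model="THUDM/glm-4v-9b", trust_remote_code=True)

placeholder = "<|begin_of_image|><|endoftext|><|end_of_image|>"  # see get_placeholder_str
prompt = f"{placeholder}Describe this image."

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": Image.open("example.jpg")}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```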

packed_modules_mapping class-attribute instance-attribute

packed_modules_mapping = {
    "query_key_value": ["query_key_value"],
    "dense_h_to_4h": ["dense_h_to_4h"],
    "merged_proj": ["gate_proj", "dense_h_to_4h"],
}

transformer instance-attribute

transformer: GLM4VModel

__init__

__init__(
    *,
    vllm_config: VllmConfig,
    prefix: str = "",
    transformer_type: type[GLM4VModel] = GLM4VModel,
) -> None
Source code in vllm/model_executor/models/glm4v.py
def __init__(
    self,
    *,
    vllm_config: VllmConfig,
    prefix: str = "",
    transformer_type: type[GLM4VModel] = GLM4VModel,
) -> None:
    super().__init__(
        vllm_config=vllm_config,
        prefix=prefix,
        transformer_type=transformer_type,
    )

    self.transformer: GLM4VModel

_parse_and_validate_image_input

_parse_and_validate_image_input(
    **kwargs: object,
) -> Optional[GLMVImagePixelInputs]
Source code in vllm/model_executor/models/glm4v.py
def _parse_and_validate_image_input(
        self, **kwargs: object) -> Optional[GLMVImagePixelInputs]:
    pixel_values = kwargs.pop("pixel_values", None)

    if pixel_values is not None:
        if not isinstance(pixel_values, (torch.Tensor, list)):
            raise ValueError("Incorrect type of pixel values. "
                             f"Got type: {type(pixel_values)}")

        return GLMVImagePixelInputs(
            type="pixel_values",
            data=self._validate_pixel_values(
                flatten_bn(pixel_values, concat=True)),
        )

    return None

_process_image_input

_process_image_input(
    image_input: GLMVImagePixelInputs,
) -> Tensor
Source code in vllm/model_executor/models/glm4v.py
def _process_image_input(
        self, image_input: GLMVImagePixelInputs) -> torch.Tensor:
    pixel_values = image_input["data"].to(dtype=self.config.torch_dtype)

    return self.transformer.vision(pixel_values)

_validate_pixel_values

_validate_pixel_values(data: Tensor) -> Tensor
Source code in vllm/model_executor/models/glm4v.py
def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
    h = w = self.config.vision_config["image_size"]
    expected_dims = (3, h, w)
    actual_dims = tuple(data.shape[1:])

    if actual_dims != expected_dims:
        expected_expr = ("batch_size", *map(str, expected_dims))
        raise ValueError(
            f"The expected shape of pixel values is {expected_expr}. "
            f"You supplied {tuple(data.shape)}.")

    return data

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    inputs_embeds: Optional[Tensor] = None,
    **kwargs: object,
) -> Union[Tensor, IntermediateTensors]
Source code in vllm/model_executor/models/glm4v.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    **kwargs: object,
) -> Union[torch.Tensor, IntermediateTensors]:
    if intermediate_tensors is not None:
        inputs_embeds = None

    # NOTE: In v1, inputs_embeds is always generated at model runner, this
    # condition is for v0 compatibility.
    elif inputs_embeds is None:
        vision_embeddings = self.get_multimodal_embeddings(**kwargs)
        inputs_embeds = self.get_input_embeddings(input_ids,
                                                  vision_embeddings)
        input_ids = None

    hidden_states = self.transformer(input_ids, positions,
                                     intermediate_tensors, inputs_embeds)

    return hidden_states

get_input_embeddings

get_input_embeddings(
    input_ids: Tensor,
    multimodal_embeddings: Optional[
        MultiModalEmbeddings
    ] = None,
) -> Tensor
Source code in vllm/model_executor/models/glm4v.py
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
    inputs_embeds = self.transformer.get_input_embeddings(input_ids)

    if multimodal_embeddings is not None \
        and len(multimodal_embeddings) != 0:
        inputs_embeds = merge_multimodal_embeddings(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            multimodal_embeddings=multimodal_embeddings,
            placeholder_token_id=[
                self.config.boi_token_id,
                self.config.pad_token_id,
                self.config.eoi_token_id,
            ],
        )

    return inputs_embeds
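
Conceptually, `merge_multimodal_embeddings` overwrites the rows of `inputs_embeds` at the placeholder positions (boi, pad, eoi) with the vision embeddings, in order. A toy, plain-PyTorch illustration with hypothetical token ids (not the function itself):

```python
import torch

hidden = 4
input_ids = torch.tensor([5, 101, 0, 0, 102, 6])  # 101 / 0 / 102 stand in for boi / pad / eoi
inputs_embeds = torch.zeros(len(input_ids), hidden)
vision_embeddings = torch.ones(4, hidden)         # boi + 2 image tokens + eoi from the vision tower

placeholder_ids = {101, 0, 102}
mask = torch.tensor([tok.item() in placeholder_ids for tok in input_ids])
inputs_embeds[mask] = vision_embeddings
print(inputs_embeds[:, 0])  # tensor([0., 1., 1., 1., 1., 0.])
```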

get_language_model

get_language_model() -> Module
Source code in vllm/model_executor/models/glm4v.py
def get_language_model(self) -> torch.nn.Module:
    return self.transformer

get_mm_mapping

get_mm_mapping() -> MultiModelKeys

Get the module prefix in multimodal models

Source code in vllm/model_executor/models/glm4v.py
def get_mm_mapping(self) -> MultiModelKeys:
    """
    Get the module prefix in multimodal models
    """
    return MultiModelKeys.from_string_field(
        language_model="transformer.encoder",
        connector="transformer.vision.linear_proj",
        tower_model="transformer.vision.transformer")

get_multimodal_embeddings

get_multimodal_embeddings(
    **kwargs: object,
) -> MultiModalEmbeddings
Source code in vllm/model_executor/models/glm4v.py
def get_multimodal_embeddings(self,
                              **kwargs: object) -> MultiModalEmbeddings:
    image_input = self._parse_and_validate_image_input(**kwargs)
    if image_input is None:
        return []

    vision_embeddings = self._process_image_input(image_input)
    return vision_embeddings

get_placeholder_str classmethod

get_placeholder_str(modality: str, i: int) -> Optional[str]
Source code in vllm/model_executor/models/glm4v.py
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    if modality.startswith("image"):
        return "<|begin_of_image|><|endoftext|><|end_of_image|>"

    raise ValueError("Only image modality is supported")

GLM4VModel

Bases: ChatGLMModel

Source code in vllm/model_executor/models/glm4v.py
class GLM4VModel(ChatGLMModel):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        quant_config = vllm_config.quant_config

        self.vision = EVA2CLIPModel(self.config,
                                    quant_config,
                                    prefix=f"{prefix}.vision")

vision instance-attribute

vision = EVA2CLIPModel(
    config, quant_config, prefix=f"{prefix}.vision"
)

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/glm4v.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__(vllm_config=vllm_config, prefix=prefix)

    quant_config = vllm_config.quant_config

    self.vision = EVA2CLIPModel(self.config,
                                quant_config,
                                prefix=f"{prefix}.vision")

GLM4VMultiModalProcessor

Bases: BaseMultiModalProcessor[GLM4VProcessingInfo]

Source code in vllm/model_executor/models/glm4v.py
class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):

    def _hf_processor_applies_updates(
        self,
        prompt_text: str,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Mapping[str, object],
    ) -> bool:
        return False

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        return dict(pixel_values=MultiModalFieldConfig.batched("image"))

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        hf_config = self.info.get_hf_config()

        boi_token_id = hf_config.boi_token_id
        image_token_id = hf_config.pad_token_id
        eoi_token_id = hf_config.eoi_token_id

        def get_replacement(item_idx: int):
            num_image_tokens = self.info.get_num_image_tokens()
            image_tokens = [image_token_id] * num_image_tokens

            return [boi_token_id] + image_tokens + [eoi_token_id]

        return [
            PromptReplacement(
                modality="image",
                target=[boi_token_id, image_token_id, eoi_token_id],
                replacement=get_replacement,
            ),
        ]

_get_mm_fields_config

_get_mm_fields_config(
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]
Source code in vllm/model_executor/models/glm4v.py
def _get_mm_fields_config(
    self,
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
    return dict(pixel_values=MultiModalFieldConfig.batched("image"))

_get_prompt_updates

_get_prompt_updates(
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]
Source code in vllm/model_executor/models/glm4v.py
def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
    hf_config = self.info.get_hf_config()

    boi_token_id = hf_config.boi_token_id
    image_token_id = hf_config.pad_token_id
    eoi_token_id = hf_config.eoi_token_id

    def get_replacement(item_idx: int):
        num_image_tokens = self.info.get_num_image_tokens()
        image_tokens = [image_token_id] * num_image_tokens

        return [boi_token_id] + image_tokens + [eoi_token_id]

    return [
        PromptReplacement(
            modality="image",
            target=[boi_token_id, image_token_id, eoi_token_id],
            replacement=get_replacement,
        ),
    ]
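
In effect, each three-token image placeholder (boi, pad, eoi) in the tokenized prompt is expanded to boi + N image tokens + eoi, where N is `get_num_image_tokens()`. A toy illustration with hypothetical token ids:

```python
# Hypothetical token ids; the real ones come from the HF config.
boi_token_id, image_token_id, eoi_token_id = 101, 0, 102
num_image_tokens = 4  # the real count comes from GLM4VProcessingInfo.get_num_image_tokens()

prompt = [7, 8, boi_token_id, image_token_id, eoi_token_id, 9]
target = [boi_token_id, image_token_id, eoi_token_id]
replacement = [boi_token_id] + [image_token_id] * num_image_tokens + [eoi_token_id]

i = next(k for k in range(len(prompt)) if prompt[k:k + len(target)] == target)
expanded = prompt[:i] + replacement + prompt[i + len(target):]
print(expanded)  # [7, 8, 101, 0, 0, 0, 0, 102, 9]
```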

_hf_processor_applies_updates

_hf_processor_applies_updates(
    prompt_text: str,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object],
) -> bool
Source code in vllm/model_executor/models/glm4v.py
def _hf_processor_applies_updates(
    self,
    prompt_text: str,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object],
) -> bool:
    return False

GLM4VProcessingInfo

Bases: BaseProcessingInfo

Source code in vllm/model_executor/models/glm4v.py
class GLM4VProcessingInfo(BaseProcessingInfo):

    def get_hf_config(self):
        return self.ctx.get_hf_config(ChatGLMConfig)

    def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor:
        return self.ctx.init_processor(
            GLM4VProcessor,
            config=self.get_hf_config(),
            tokenizer=self.get_tokenizer(),
            **kwargs,
        )

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": 1}

    def get_num_image_tokens(self) -> int:
        hf_config = self.get_hf_config()
        vision_config = hf_config.vision_config

        image_size = vision_config["image_size"]
        patch_size = vision_config["patch_size"]
        grid_length = image_size // patch_size // 2
        return grid_length * grid_length

    def get_num_image_feature_tokens(self) -> int:
        # EVA2CLIPModel has embeddings for boi and eoi tokens as well
        return self.get_num_image_tokens() + 2

get_hf_config

get_hf_config()
Source code in vllm/model_executor/models/glm4v.py
def get_hf_config(self):
    return self.ctx.get_hf_config(ChatGLMConfig)

get_hf_processor

get_hf_processor(**kwargs: object) -> GLM4VProcessor
Source code in vllm/model_executor/models/glm4v.py
def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor:
    return self.ctx.init_processor(
        GLM4VProcessor,
        config=self.get_hf_config(),
        tokenizer=self.get_tokenizer(),
        **kwargs,
    )

get_num_image_feature_tokens

get_num_image_feature_tokens() -> int
Source code in vllm/model_executor/models/glm4v.py
def get_num_image_feature_tokens(self) -> int:
    # EVA2CLIPModel has embeddings for boi and eoi tokens as well
    return self.get_num_image_tokens() + 2

get_num_image_tokens

get_num_image_tokens() -> int
Source code in vllm/model_executor/models/glm4v.py
def get_num_image_tokens(self) -> int:
    hf_config = self.get_hf_config()
    vision_config = hf_config.vision_config

    image_size = vision_config["image_size"]
    patch_size = vision_config["patch_size"]
    grid_length = image_size // patch_size // 2
    return grid_length * grid_length
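
For example, assuming (purely for illustration) a vision config with image_size=1120 and patch_size=14, the count works out as follows; the real values come from the loaded checkpoint's config:

```python
image_size = 1120  # illustrative value, not necessarily the real config
patch_size = 14    # illustrative value, not necessarily the real config

grid_length = image_size // patch_size // 2      # the 2x2 conv in EVA2CLIPModel halves the grid
num_image_tokens = grid_length * grid_length     # 40 * 40 = 1600
num_image_feature_tokens = num_image_tokens + 2  # plus the boi / eoi embeddings
print(num_image_tokens, num_image_feature_tokens)  # 1600 1602
```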

get_supported_mm_limits

get_supported_mm_limits() -> Mapping[str, Optional[int]]
Source code in vllm/model_executor/models/glm4v.py
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
    return {"image": 1}

GLM4VProcessor

This model doesn't define its own HF processor, so we implement our own here.

Source code in vllm/model_executor/models/glm4v.py
class GLM4VProcessor:
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.
    """

    def __init__(
        self,
        config: ChatGLMConfig,
        tokenizer: PreTrainedTokenizer,
    ) -> None:
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        vision_config = config.vision_config
        image_size = vision_config["image_size"]

        self.image_transform = transforms.Compose([
            transforms.Resize(
                (image_size, image_size),
                interpolation=InterpolationMode.BICUBIC,
            ),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=(0.48145466, 0.4578275, 0.40821073),
                std=(0.26862954, 0.26130258, 0.27577711),
            ),
        ])

    def __call__(
        self,
        text: Optional[Union[TextInput, list[TextInput]]] = None,
        images: Optional[Union[ImageInput, list[ImageInput]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ) -> BatchFeature:
        if text is None:
            text = []
        if not isinstance(text, list):
            text = [text]
        if images is None:
            images = []
        if not isinstance(images, list):
            images = [images]

        text_inputs = self.tokenizer(text)

        if len(images) == 0:
            image_inputs = {}
        else:
            pixel_values = [self.image_transform(image) for image in images]
            image_inputs = {"pixel_values": torch.stack(pixel_values)}

        return BatchFeature(
            {
                **text_inputs,
                **image_inputs,
            },
            tensor_type=return_tensors,
        )
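
The preprocessing follows the standard CLIP-style resize/normalize pipeline. A standalone sketch of just the transform (image_size fixed to 224 here purely for illustration; the real value comes from `config.vision_config["image_size"]`):

```python
import torch
from PIL import Image
from torchvision import transforms
from torchvision.transforms import InterpolationMode

image_size = 224  # illustrative; read the real value from the model config

image_transform = transforms.Compose([
    transforms.Resize((image_size, image_size),
                      interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.48145466, 0.4578275, 0.40821073),
        std=(0.26862954, 0.26130258, 0.27577711),
    ),
])

img = Image.new("RGB", (640, 480))                 # placeholder image
pixel_values = torch.stack([image_transform(img)])
print(pixel_values.shape)                          # torch.Size([1, 3, 224, 224])
```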

config instance-attribute

config = config

image_transform instance-attribute

image_transform = Compose(
    [
        Resize(
            (image_size, image_size), interpolation=BICUBIC
        ),
        ToTensor(),
        Normalize(
            mean=(0.48145466, 0.4578275, 0.40821073),
            std=(0.26862954, 0.26130258, 0.27577711),
        ),
    ]
)

tokenizer instance-attribute

tokenizer = tokenizer

__call__

__call__(
    text: Optional[
        Union[TextInput, list[TextInput]]
    ] = None,
    images: Optional[
        Union[ImageInput, list[ImageInput]]
    ] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchFeature
Source code in vllm/model_executor/models/glm4v.py
def __call__(
    self,
    text: Optional[Union[TextInput, list[TextInput]]] = None,
    images: Optional[Union[ImageInput, list[ImageInput]]] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchFeature:
    if text is None:
        text = []
    if not isinstance(text, list):
        text = [text]
    if images is None:
        images = []
    if not isinstance(images, list):
        images = [images]

    text_inputs = self.tokenizer(text)

    if len(images) == 0:
        image_inputs = {}
    else:
        pixel_values = [self.image_transform(image) for image in images]
        image_inputs = {"pixel_values": torch.stack(pixel_values)}

    return BatchFeature(
        {
            **text_inputs,
            **image_inputs,
        },
        tensor_type=return_tensors,
    )

__init__

__init__(
    config: ChatGLMConfig, tokenizer: PreTrainedTokenizer
) -> None
Source code in vllm/model_executor/models/glm4v.py
def __init__(
    self,
    config: ChatGLMConfig,
    tokenizer: PreTrainedTokenizer,
) -> None:
    super().__init__()

    self.config = config
    self.tokenizer = tokenizer

    vision_config = config.vision_config
    image_size = vision_config["image_size"]

    self.image_transform = transforms.Compose([
        transforms.Resize(
            (image_size, image_size),
            interpolation=InterpolationMode.BICUBIC,
        ),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.48145466, 0.4578275, 0.40821073),
            std=(0.26862954, 0.26130258, 0.27577711),
        ),
    ])

GLMVImagePixelInputs

Bases: TypedDict

Source code in vllm/model_executor/models/glm4v.py
class GLMVImagePixelInputs(TypedDict):
    type: Literal["pixel_values"]
    data: torch.Tensor
    """Shape: `(batch_size, num_channels, height, width)`"""

data instance-attribute

data: Tensor

Shape: (batch_size, num_channels, height, width)

type instance-attribute

type: Literal['pixel_values']
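
A minimal sketch of constructing an input of this shape with dummy data (the 224x224 size is an assumption for illustration only):

```python
import torch

from vllm.model_executor.models.glm4v import GLMVImagePixelInputs

image_input: GLMVImagePixelInputs = {
    "type": "pixel_values",
    "data": torch.zeros(1, 3, 224, 224),  # (batch_size, num_channels, height, width)
}
```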