vllm.model_executor.models.glm4
Inference-only GLM-4-0414 model compatible with HuggingFace weights.
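As a quick orientation, here is a minimal usage sketch: the model is normally driven through vLLM's public LLM API rather than instantiated directly. The checkpoint name below is illustrative only and is an assumption, not taken from this module; substitute any GLM-4-0414 checkpoint you actually have.

# Minimal usage sketch (assumption: a GLM-4-0414 checkpoint such as
# "THUDM/GLM-4-9B-0414" is available locally or on the Hugging Face Hub).
from vllm import LLM, SamplingParams

llm = LLM(model="THUDM/GLM-4-9B-0414")  # illustrative checkpoint name
params = SamplingParams(temperature=0.7, max_tokens=128)
outputs = llm.generate(["Explain rotary position embeddings in one paragraph."], params)
print(outputs[0].outputs[0].text)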
ALL_DECODER_LAYER_TYPES module-attribute
ALL_DECODER_LAYER_TYPES = {'attention': Glm4DecoderLayer}
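The mapping associates a layer-type string with the decoder layer class implementing it. A minimal sketch of consulting such a mapping is shown below; the per-layer `layer_types` list is a hypothetical input used purely for illustration and is not part of this module.

# Sketch only: resolving decoder layer classes by type string.
from vllm.model_executor.models.glm4 import ALL_DECODER_LAYER_TYPES

layer_types = ["attention", "attention", "attention"]  # hypothetical per-layer types
layer_classes = [ALL_DECODER_LAYER_TYPES[t] for t in layer_types]  # -> [Glm4DecoderLayer, ...]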
Glm4Attention
Bases: Module
attn instance-attribute
attn = Attention(
    num_heads,
    head_dim,
    scaling,
    num_kv_heads=num_kv_heads,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.attn",
    attn_type=attn_type,
)
o_proj instance-attribute
o_proj = RowParallelLinear(
    total_num_heads * head_dim,
    hidden_size,
    bias=False,
    quant_config=quant_config,
    prefix=f"{prefix}.o_proj",
)
qkv_proj instance-attribute
qkv_proj = QKVParallelLinear(
    hidden_size,
    head_dim,
    total_num_heads,
    total_num_kv_heads,
    bias=qkv_bias,
    quant_config=quant_config,
    prefix=f"{prefix}.qkv_proj",
)
rotary_emb instance-attribute
rotary_emb = get_rope(
    head_dim,
    rotary_dim=rotary_dim,
    max_position=max_position,
    base=rope_theta,
    rope_scaling=rope_scaling,
    partial_rotary_factor=partial_rotary_factor,
    is_neox_style=False,
)
__init__
__init__(
    config: Glm4Config,
    hidden_size: int,
    num_heads: int,
    num_kv_heads: int,
    max_position: int = 4096 * 32,
    head_dim: Optional[int] = None,
    qkv_bias: bool = False,
    rope_theta: float = 10000,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    rope_scaling: Optional[tuple] = None,
    prefix: str = "",
    attn_type: str = DECODER,
) -> None
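The constructor exposes per-head quantities consumed by the attributes above (head_dim, scaling, per-rank head counts). Below is a minimal sketch of the conventional derivation; it is an assumption stated for orientation, not a quote of the implementation, and tensor-parallel code would additionally divide the head counts by the TP world size.

# Sketch (assumption): a missing head_dim falls back to hidden_size // total_num_heads,
# and the attention scale is 1 / sqrt(head_dim).
from typing import Optional

def derive_attention_dims(hidden_size: int, total_num_heads: int,
                          head_dim: Optional[int] = None) -> tuple[int, float]:
    if head_dim is None:
        head_dim = hidden_size // total_num_heads
    scaling = head_dim ** -0.5
    return head_dim, scaling

head_dim, scaling = derive_attention_dims(4096, 32)  # -> (128, ~0.088)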
forward
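The attributes above suggest the usual projection/attention pipeline. The sketch below shows that flow in isolation; it is an assumption about the shape of forward rather than the module's actual code, and the split sizes and callables are illustrative stand-ins.

# Sketch (assumption): how qkv_proj, rotary_emb, attn and o_proj typically compose.
import torch

def attention_forward_sketch(positions: torch.Tensor,
                             hidden_states: torch.Tensor,
                             qkv_proj, rotary_emb, attn, o_proj,
                             q_size: int, kv_size: int) -> torch.Tensor:
    qkv, _ = qkv_proj(hidden_states)                      # fused QKV projection
    q, k, v = qkv.split([q_size, kv_size, kv_size], dim=-1)
    q, k = rotary_emb(positions, q, k)                    # partial, non-neox-style RoPE
    attn_out = attn(q, k, v)                              # paged attention kernel
    output, _ = o_proj(attn_out)                          # output projection
    return output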
Glm4DecoderLayer
Bases: Module
mlp instance-attribute
mlp = LlamaMLP(
    hidden_size=hidden_size,
    intermediate_size=intermediate_size,
    hidden_act=hidden_act,
    quant_config=quant_config,
    prefix=f"{prefix}.mlp",
)
post_attention_layernorm instance-attribute
post_attention_layernorm = RMSNorm(
    hidden_size, eps=rms_norm_eps
)
post_self_attn_layernorm instance-attribute
post_self_attn_layernorm = RMSNorm(
    hidden_size, eps=rms_norm_eps
)
self_attn instance-attribute
self_attn = Glm4Attention(
    config=config,
    hidden_size=hidden_size,
    num_heads=num_attention_heads,
    max_position=max_position_embeddings,
    num_kv_heads=num_key_value_heads,
    rope_theta=rope_theta,
    qkv_bias=getattr(config, "attention_bias", False),
    head_dim=getattr(config, "head_dim", None),
    cache_config=cache_config,
    quant_config=quant_config,
    rope_scaling=rope_scaling,
    prefix=f"{prefix}.self_attn",
    attn_type=DECODER,
)
__init__
__init__(
    config: Glm4Config,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
) -> None
forward
forward(
    positions: Tensor,
    hidden_states: Tensor,
    residual: Optional[Tensor],
) -> tuple[Tensor, Tensor]
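forward returns a (hidden_states, residual) pair, which is how vLLM threads the residual stream from layer to layer. The sketch below illustrates one plausible ordering of the norms listed above around attention and the MLP; it is an assumption for illustration only (the real layer also uses norms not enumerated in this section, and fuses the residual add into vLLM's RMSNorm), with plain torch modules standing in for the vLLM ones.

# Sketch (assumption): residual threading in a GLM-4-style decoder layer,
# with an extra norm applied to the attention output before the residual add.
# Requires torch >= 2.4 for nn.RMSNorm.
import torch
import torch.nn as nn

class DecoderLayerSketch(nn.Module):
    def __init__(self, hidden_size: int, eps: float = 1e-5):
        super().__init__()
        self.input_layernorm = nn.RMSNorm(hidden_size, eps=eps)          # assumed
        self.post_self_attn_layernorm = nn.RMSNorm(hidden_size, eps=eps)
        self.post_attention_layernorm = nn.RMSNorm(hidden_size, eps=eps)
        self.self_attn = nn.Identity()   # stand-in for Glm4Attention (ignores positions)
        self.mlp = nn.Identity()         # stand-in for LlamaMLP

    def forward(self, hidden_states, residual):
        # Fold in the residual carried over from the previous layer.
        residual = hidden_states if residual is None else hidden_states + residual
        hidden_states = self.input_layernorm(residual)
        # Attention block, with an extra norm on its output.
        attn_out = self.post_self_attn_layernorm(self.self_attn(hidden_states))
        residual = residual + attn_out
        # MLP block; the final residual add is deferred to the next layer.
        hidden_states = self.mlp(self.post_attention_layernorm(residual))
        return hidden_states, residual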
Glm4ForCausalLM
Bases: Module, SupportsLoRA, SupportsPP
make_empty_intermediate_tensors instance-attribute
model instance-attribute
model = Glm4Model(
    vllm_config=vllm_config,
    prefix=maybe_prefix(prefix, "model"),
)
packed_modules_mapping class-attribute instance-attribute
packed_modules_mapping = {
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    "gate_up_proj": ["gate_proj", "up_proj"],
}
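packed_modules_mapping tells the weight loader and the LoRA machinery which per-matrix HuggingFace checkpoint weights are fused into a single vLLM module. A minimal sketch of the name rewriting and stacking this implies, using plain tensors rather than vLLM's actual loader (the layer prefix and shapes are illustrative):

# Sketch (assumption): HF q_proj/k_proj/v_proj weights are concatenated along the
# output dimension into the fused qkv_proj weight. Real loading goes through
# load_weights and vLLM's stacked-parameter logic.
import torch

hf_weights = {  # tiny fake checkpoint shard, attention weights only
    "model.layers.0.self_attn.q_proj.weight": torch.zeros(8, 8),
    "model.layers.0.self_attn.k_proj.weight": torch.zeros(4, 8),
    "model.layers.0.self_attn.v_proj.weight": torch.zeros(4, 8),
}

parts = ["q_proj", "k_proj", "v_proj"]  # packed_modules_mapping["qkv_proj"]
pieces = [hf_weights[f"model.layers.0.self_attn.{p}.weight"] for p in parts]
fused = {"model.layers.0.self_attn.qkv_proj.weight": torch.cat(pieces, dim=0)}
# fused["...qkv_proj.weight"].shape == torch.Size([16, 8])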
__init__
__init__(*, vllm_config: VllmConfig, prefix: str = '')
compute_logits
compute_logits(
    hidden_states: Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[Tensor]
forward
forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[Tensor] = None,
) -> Union[Tensor, IntermediateTensors]
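The causal-LM wrapper's forward follows the common vLLM pattern: it delegates to the inner model and, under pipeline parallelism, may receive and return IntermediateTensors instead of token ids and hidden states. A hedged sketch of that dispatch, not the module's actual code:

# Sketch (assumption): typical delegation from the *ForCausalLM wrapper to the
# inner transformer stack. `self.model` is the Glm4Model instance shown above.
def forward_sketch(self, input_ids, positions,
                   intermediate_tensors=None, inputs_embeds=None):
    return self.model(input_ids, positions, intermediate_tensors,
                      inputs_embeds=inputs_embeds)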
get_input_embeddings
load_weights
Glm4Model
Bases: LlamaModel
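Glm4Model subclasses LlamaModel, reusing the generic Llama-style embedding/decoder/norm stack. The sketch below shows what such a subclass typically overrides; the layer_type keyword is an assumption about LlamaModel's constructor, used here only for illustration and not quoted from the source.

# Sketch (assumption): a LlamaModel subclass that swaps in Glm4DecoderLayer.
from vllm.model_executor.models.glm4 import Glm4DecoderLayer
from vllm.model_executor.models.llama import LlamaModel

class Glm4ModelSketch(LlamaModel):
    def __init__(self, *, vllm_config, prefix: str = ""):
        super().__init__(vllm_config=vllm_config,
                         prefix=prefix,
                         layer_type=Glm4DecoderLayer)  # assumed constructor argument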