
vllm.transformers_utils.configs

Modules:

Name               Description
arctic             Arctic model configuration
chatglm
cohere2
dbrx               Dbrx configuration.
deepseek_vl2
eagle
exaone             Exaone model configuration
falcon             Falcon configuration
jais               JAIS configuration
kimi_vl
medusa
minimax_text_01    MiniMaxText01 model configuration
minimax_vl_01      MiniMaxVL01 model configuration
mllama
mlp_speculator
moonvit
mpt                A HuggingFace-style model configuration.
nemotron           Nemotron model configuration
nemotron_h         NemotronH model configuration
nvlm_d
ovis
skyworkr1v
solar              Solar model configuration
telechat2          Telechat configuration compatible with LlamaConfig.
ultravox

__all__ module-attribute

__all__ = [
    "ChatGLMConfig",
    "Cohere2Config",
    "DbrxConfig",
    "DeepseekVLV2Config",
    "MPTConfig",
    "RWConfig",
    "JAISConfig",
    "MedusaConfig",
    "EAGLEConfig",
    "ExaoneConfig",
    "MiniMaxText01Config",
    "MiniMaxVL01Config",
    "MllamaConfig",
    "MLPSpeculatorConfig",
    "MoonViTConfig",
    "KimiVLConfig",
    "NemotronConfig",
    "NemotronHConfig",
    "NVLM_D_Config",
    "OvisConfig",
    "SkyworkR1VChatConfig",
    "SolarConfig",
    "Telechat2Config",
    "UltravoxConfig",
]
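
The names listed in __all__ can be imported directly from vllm.transformers_utils.configs. A minimal usage sketch, assuming a working vllm installation (the printed values follow the model_type attributes documented below):

from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig

chatglm_cfg = ChatGLMConfig()          # defaults as documented below
print(chatglm_cfg.model_type)          # "chatglm"
print(DbrxConfig().model_type)         # "dbrx"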

ChatGLMConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/chatglm.py
class ChatGLMConfig(PretrainedConfig):
    model_type = "chatglm"
    attribute_map = {
        "num_hidden_layers": "num_layers",
        "n_head_kv": "multi_query_group_num",
    }

    def __init__(self,
                 num_layers=28,
                 padded_vocab_size=65024,
                 hidden_size=4096,
                 ffn_hidden_size=13696,
                 kv_channels=128,
                 num_attention_heads=32,
                 seq_length=2048,
                 hidden_dropout=0.0,
                 attention_dropout=0.0,
                 layernorm_epsilon=1e-5,
                 rmsnorm=True,
                 apply_residual_connection_post_layernorm=False,
                 post_layer_norm=True,
                 add_bias_linear=False,
                 add_qkv_bias=False,
                 interleaved_qkv=False,
                 bias_dropout_fusion=True,
                 multi_query_attention=False,
                 multi_query_group_num=1,
                 apply_query_key_layer_scaling=True,
                 attention_softmax_in_fp32=True,
                 fp32_residual_connection=False,
                 quantization_bit=0,
                 pre_seq_len=None,
                 prefix_projection=False,
                 **kwargs):
        self.num_layers = num_layers
        self.vocab_size = padded_vocab_size
        self.padded_vocab_size = padded_vocab_size
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.kv_channels = kv_channels
        self.num_attention_heads = num_attention_heads
        self.seq_length = seq_length
        # It is to be compatible with long lora.
        self.max_position_embeddings = seq_length
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.layernorm_epsilon = layernorm_epsilon
        self.rmsnorm = rmsnorm
        self.apply_residual_connection_post_layernorm = (
            apply_residual_connection_post_layernorm)
        self.post_layer_norm = post_layer_norm
        self.add_bias_linear = add_bias_linear
        self.add_qkv_bias = add_qkv_bias
        self.bias_dropout_fusion = bias_dropout_fusion
        self.multi_query_attention = multi_query_attention
        self.multi_query_group_num = multi_query_group_num
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.fp32_residual_connection = fp32_residual_connection
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection
        self.interleaved_qkv = interleaved_qkv
        super().__init__(**kwargs)

add_bias_linear instance-attribute

add_bias_linear = add_bias_linear

add_qkv_bias instance-attribute

add_qkv_bias = add_qkv_bias

apply_query_key_layer_scaling instance-attribute

apply_query_key_layer_scaling = (
    apply_query_key_layer_scaling
)

apply_residual_connection_post_layernorm instance-attribute

apply_residual_connection_post_layernorm = (
    apply_residual_connection_post_layernorm
)

attention_dropout instance-attribute

attention_dropout = attention_dropout

attention_softmax_in_fp32 instance-attribute

attention_softmax_in_fp32 = attention_softmax_in_fp32

attribute_map class-attribute instance-attribute

attribute_map = {
    "num_hidden_layers": "num_layers",
    "n_head_kv": "multi_query_group_num",
}

bias_dropout_fusion instance-attribute

bias_dropout_fusion = bias_dropout_fusion

ffn_hidden_size instance-attribute

ffn_hidden_size = ffn_hidden_size

fp32_residual_connection instance-attribute

fp32_residual_connection = fp32_residual_connection

hidden_dropout instance-attribute

hidden_dropout = hidden_dropout

hidden_size instance-attribute

hidden_size = hidden_size

interleaved_qkv instance-attribute

interleaved_qkv = interleaved_qkv

kv_channels instance-attribute

kv_channels = kv_channels

layernorm_epsilon instance-attribute

layernorm_epsilon = layernorm_epsilon

max_position_embeddings instance-attribute

max_position_embeddings = seq_length

model_type class-attribute instance-attribute

model_type = 'chatglm'

multi_query_attention instance-attribute

multi_query_attention = multi_query_attention

multi_query_group_num instance-attribute

multi_query_group_num = multi_query_group_num

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_layers instance-attribute

num_layers = num_layers

padded_vocab_size instance-attribute

padded_vocab_size = padded_vocab_size

post_layer_norm instance-attribute

post_layer_norm = post_layer_norm

pre_seq_len instance-attribute

pre_seq_len = pre_seq_len

prefix_projection instance-attribute

prefix_projection = prefix_projection

quantization_bit instance-attribute

quantization_bit = quantization_bit

rmsnorm instance-attribute

rmsnorm = rmsnorm

seq_length instance-attribute

seq_length = seq_length

vocab_size instance-attribute

vocab_size = padded_vocab_size

__init__

__init__(
    num_layers=28,
    padded_vocab_size=65024,
    hidden_size=4096,
    ffn_hidden_size=13696,
    kv_channels=128,
    num_attention_heads=32,
    seq_length=2048,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    layernorm_epsilon=1e-05,
    rmsnorm=True,
    apply_residual_connection_post_layernorm=False,
    post_layer_norm=True,
    add_bias_linear=False,
    add_qkv_bias=False,
    interleaved_qkv=False,
    bias_dropout_fusion=True,
    multi_query_attention=False,
    multi_query_group_num=1,
    apply_query_key_layer_scaling=True,
    attention_softmax_in_fp32=True,
    fp32_residual_connection=False,
    quantization_bit=0,
    pre_seq_len=None,
    prefix_projection=False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/chatglm.py
def __init__(self,
             num_layers=28,
             padded_vocab_size=65024,
             hidden_size=4096,
             ffn_hidden_size=13696,
             kv_channels=128,
             num_attention_heads=32,
             seq_length=2048,
             hidden_dropout=0.0,
             attention_dropout=0.0,
             layernorm_epsilon=1e-5,
             rmsnorm=True,
             apply_residual_connection_post_layernorm=False,
             post_layer_norm=True,
             add_bias_linear=False,
             add_qkv_bias=False,
             interleaved_qkv=False,
             bias_dropout_fusion=True,
             multi_query_attention=False,
             multi_query_group_num=1,
             apply_query_key_layer_scaling=True,
             attention_softmax_in_fp32=True,
             fp32_residual_connection=False,
             quantization_bit=0,
             pre_seq_len=None,
             prefix_projection=False,
             **kwargs):
    self.num_layers = num_layers
    self.vocab_size = padded_vocab_size
    self.padded_vocab_size = padded_vocab_size
    self.hidden_size = hidden_size
    self.ffn_hidden_size = ffn_hidden_size
    self.kv_channels = kv_channels
    self.num_attention_heads = num_attention_heads
    self.seq_length = seq_length
    # It is to be compatible with long lora.
    self.max_position_embeddings = seq_length
    self.hidden_dropout = hidden_dropout
    self.attention_dropout = attention_dropout
    self.layernorm_epsilon = layernorm_epsilon
    self.rmsnorm = rmsnorm
    self.apply_residual_connection_post_layernorm = (
        apply_residual_connection_post_layernorm)
    self.post_layer_norm = post_layer_norm
    self.add_bias_linear = add_bias_linear
    self.add_qkv_bias = add_qkv_bias
    self.bias_dropout_fusion = bias_dropout_fusion
    self.multi_query_attention = multi_query_attention
    self.multi_query_group_num = multi_query_group_num
    self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
    self.attention_softmax_in_fp32 = attention_softmax_in_fp32
    self.fp32_residual_connection = fp32_residual_connection
    self.quantization_bit = quantization_bit
    self.pre_seq_len = pre_seq_len
    self.prefix_projection = prefix_projection
    self.interleaved_qkv = interleaved_qkv
    super().__init__(**kwargs)
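
A short illustrative sketch of how the attribute_map above aliases the Hugging Face-standard names onto the ChatGLM-native attributes (standard PretrainedConfig behavior); the argument values here are made up for illustration:

from vllm.transformers_utils.configs import ChatGLMConfig

cfg = ChatGLMConfig(num_layers=40, multi_query_group_num=2)
assert cfg.num_hidden_layers == 40                    # alias for num_layers
assert cfg.n_head_kv == 2                             # alias for multi_query_group_num
assert cfg.max_position_embeddings == cfg.seq_length  # set in __init__ for long-LoRA compatibility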

Cohere2Config

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [CohereModel]. It is used to instantiate a Cohere model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information. Instantiating a configuration with the defaults will yield a similar configuration to that of the CohereForAI/c4ai-command-r-v01 model.

Parameters:

Name Type Description Default
vocab_size `int`, *optional*, defaults to 256000

Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the inputs_ids passed when calling [CohereModel]

256000
hidden_size `int`, *optional*, defaults to 8192

Dimension of the hidden representations.

8192
intermediate_size `int`, *optional*, defaults to 22528

Dimension of the MLP representations.

22528
logit_scale `float`, *optional*, defaults to 0.0625

The scaling factor for the output logits.

0.0625
num_hidden_layers `int`, *optional*, defaults to 40

Number of hidden layers in the Transformer decoder.

40
num_attention_heads `int`, *optional*, defaults to 64

Number of attention heads for each attention layer in the Transformer decoder.

64
num_key_value_heads `int`, *optional*

This is the number of key_value heads that should be used to implement Grouped Query Attention. If num_key_value_heads=num_attention_heads, the model will use Multi Head Attention (MHA); if num_key_value_heads=1, the model will use Multi Query Attention (MQA); otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by mean-pooling all the original heads within that group. For more details, check out this paper: https://arxiv.org/pdf/2305.13245.pdf. If it is not specified, it will default to num_attention_heads.

None
hidden_act `str` or `function`, *optional*, defaults to `"silu"`

The non-linear activation function (function or string) in the decoder.

'silu'
max_position_embeddings `int`, *optional*, defaults to 8192

The maximum sequence length that this model might ever be used with.

8192
initializer_range `float`, *optional*, defaults to 0.02

The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

0.02
layer_norm_eps `float`, *optional*, defaults to 1e-05

The epsilon used by the layer normalization.

1e-05
use_cache `bool`, *optional*, defaults to `True`

Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if config.is_decoder=True.

True
pad_token_id `int`, *optional*, defaults to 0

Padding token id.

0
bos_token_id `int`, *optional*, defaults to 5

Beginning of stream token id.

5
eos_token_id `int`, *optional*, defaults to 255001

End of stream token id.

255001
tie_word_embeddings `bool`, *optional*, defaults to `True`

Whether to tie weight embeddings

True
rope_theta `float`, *optional*, defaults to 10000.0

The base period of the RoPE embeddings.

10000.0
rope_scaling `dict`, *optional*

Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type and you expect the model to work on a longer max_position_embeddings, we recommend you update this value accordingly. Expected contents:

rope_type (str): The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
factor (float, optional): Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In most scaling types, a factor of x will enable the model to handle sequences of length x * the original maximum pre-trained length.
original_max_position_embeddings (int, optional): Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during pretraining.
attention_factor (float, optional): Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention computation. If unspecified, it defaults to the value recommended by the implementation, using the factor field to infer the suggested value.
beta_fast (float, optional): Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear ramp function. If unspecified, it defaults to 32.
beta_slow (float, optional): Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear ramp function. If unspecified, it defaults to 1.
short_factor (list[float], optional): Only used with 'longrope'. The scaling factor to be applied to short contexts (< original_max_position_embeddings). Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2.
long_factor (list[float], optional): Only used with 'longrope'. The scaling factor to be applied to long contexts (> original_max_position_embeddings). Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2.
low_freq_factor (float, optional): Only used with 'llama3'. Scaling factor applied to low-frequency components of the RoPE.
high_freq_factor (float, optional): Only used with 'llama3'. Scaling factor applied to high-frequency components of the RoPE.

None
attention_bias `bool`, *optional*, defaults to `False`

Whether to use a bias in the query, key, value and output projection layers during self-attention.

False
attention_dropout `float`, *optional*, defaults to 0.0

The dropout ratio for the attention probabilities.

0.0
sliding_window `int`, *optional*, defaults to 4096

Size of the sliding window attention context.

4096
sliding_window_pattern `int`, *optional*, defaults to 4

Pattern for the sliding window attention.

4
cache_implementation `str`, *optional*, defaults to `"hybrid"`

the cache type to be used with generate.

'hybrid'
>>> from transformers import Cohere2Model, Cohere2Config

>>> # Initializing a Cohere2 model configuration
>>> configuration = Cohere2Config()

>>> # Initializing a model from the Cohere2 configuration
>>> model = Cohere2Model(configuration) # doctest: +SKIP

>>> # Accessing the model configuration
>>> configuration = model.config # doctest: +SKIP
Source code in vllm/transformers_utils/configs/cohere2.py
class Cohere2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate a Cohere
    model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.


    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`CohereModel`]
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22528):
            Dimension of the MLP representations.
        logit_scale (`float`, *optional*, defaults to 0.0625):
            The scaling factor for the output logits.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 5):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 255001):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        sliding_window (`int`, *optional*, defaults to 4096):
            Size of the sliding window attention context.
        sliding_window_pattern (`int`, *optional*, defaults to 4):
            Pattern for the sliding window attention.
        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.

    ```python
    >>> from transformers import Cohere2Model, Cohere2Config

    >>> # Initializing a Cohere2 model configuration
    >>> configuration = Cohere2Config()

    >>> # Initializing a model from the Cohere2 configuration
    >>> model = Cohere2Model(configuration) # doctest: +SKIP

    >>> # Accessing the model configuration
    >>> configuration = model.config # doctest: +SKIP
    ```
    """

    model_type = "cohere2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=8192,
        intermediate_size=22528,
        logit_scale=0.0625,
        num_hidden_layers=40,
        num_attention_heads=64,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=5,
        eos_token_id=255001,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        sliding_window=4096,
        sliding_window_pattern=4,
        cache_implementation="hybrid",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.logit_scale = logit_scale
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.sliding_window = sliding_window
        self.sliding_window_pattern = sliding_window_pattern
        # Need to specify head_dim in the config so it can be used in the attention forward functions
        self.head_dim = hidden_size // num_attention_heads
        self.cache_implementation = cache_implementation

        # Validate the correctness of rotary position embeddings parameters
        rope_config_validation(self)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

cache_implementation instance-attribute

cache_implementation = cache_implementation

head_dim instance-attribute

head_dim = hidden_size // num_attention_heads

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_eps instance-attribute

layer_norm_eps = layer_norm_eps

logit_scale instance-attribute

logit_scale = logit_scale

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

model_type class-attribute instance-attribute

model_type = 'cohere2'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

sliding_window instance-attribute

sliding_window = sliding_window

sliding_window_pattern instance-attribute

sliding_window_pattern = sliding_window_pattern

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=256000,
    hidden_size=8192,
    intermediate_size=22528,
    logit_scale=0.0625,
    num_hidden_layers=40,
    num_attention_heads=64,
    num_key_value_heads=None,
    hidden_act="silu",
    max_position_embeddings=8192,
    initializer_range=0.02,
    layer_norm_eps=1e-05,
    use_cache=True,
    pad_token_id=0,
    bos_token_id=5,
    eos_token_id=255001,
    tie_word_embeddings=True,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    sliding_window=4096,
    sliding_window_pattern=4,
    cache_implementation="hybrid",
    **kwargs,
)
Source code in vllm/transformers_utils/configs/cohere2.py
def __init__(
    self,
    vocab_size=256000,
    hidden_size=8192,
    intermediate_size=22528,
    logit_scale=0.0625,
    num_hidden_layers=40,
    num_attention_heads=64,
    num_key_value_heads=None,
    hidden_act="silu",
    max_position_embeddings=8192,
    initializer_range=0.02,
    layer_norm_eps=1e-5,
    use_cache=True,
    pad_token_id=0,
    bos_token_id=5,
    eos_token_id=255001,
    tie_word_embeddings=True,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    sliding_window=4096,
    sliding_window_pattern=4,
    cache_implementation="hybrid",
    **kwargs,
):
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.logit_scale = logit_scale
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.layer_norm_eps = layer_norm_eps
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout
    self.sliding_window = sliding_window
    self.sliding_window_pattern = sliding_window_pattern
    # Need to specify head_dim in the config so it can be used in the attention forward functions
    self.head_dim = hidden_size // num_attention_heads
    self.cache_implementation = cache_implementation

    # Validate the correctness of rotary position embeddings parameters
    rope_config_validation(self)

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
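
A small sketch of the defaulting logic in the constructor above, using illustrative values: num_key_value_heads falls back to num_attention_heads, and head_dim is derived as hidden_size // num_attention_heads.

from vllm.transformers_utils.configs import Cohere2Config

cfg = Cohere2Config(hidden_size=4096, num_attention_heads=32)
assert cfg.num_key_value_heads == 32  # defaulted to num_attention_heads
assert cfg.head_dim == 128            # 4096 // 32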

DbrxConfig

Bases: PretrainedConfig

Configuration class for Dbrx.

This is the configuration class to store the configuration of a [DbrxModel]. It is used to instantiate a Dbrx model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

Name Type Description Default
d_model `int`, *optional*, defaults to 2048

Dimensionality of the embeddings and hidden states.

2048
n_heads `int`, *optional*, defaults to 16

Number of attention heads for each attention layer in the Transformer encoder.

16
n_layers `int`, *optional*, defaults to 24

Number of hidden layers in the Transformer encoder.

24
max_seq_len `int`, *optional*, defaults to 2048

The maximum sequence length of the model.

2048
vocab_size `int`, *optional*, defaults to 32000

Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by the inputs_ids passed when calling [DbrxModel].

32000
resid_pdrop `float`, *optional*, defaults to 0.0

The dropout probability applied to the attention output before combining with residual.

0.0
emb_pdrop `float`, *optional*, defaults to 0.0

The dropout probability for the embedding layer.

0.0
attn_config `dict`, *optional*

A dictionary used to configure the model's attention module.

None
ffn_config `dict`, *optional*

A dictionary used to configure the model's FFN module.

None
use_cache `bool`, *optional*, defaults to `True`

Whether or not the model should return the last key/values attentions (not used by all models).

True
initializer_range `float`, *optional*, defaults to 0.02

The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

0.02
output_router_logits `bool`, *optional*, defaults to `False`

Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss.

False
router_aux_loss_coef `float`, *optional*, defaults to 0.05

The aux loss factor for the total loss.

0.05

Example:

>>> from transformers import DbrxConfig, DbrxModel

>>> # Initializing a Dbrx configuration
>>> configuration = DbrxConfig()

>>> # Initializing a model (with random weights) from the configuration
>>> model = DbrxModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

Source code in vllm/transformers_utils/configs/dbrx.py
class DbrxConfig(PretrainedConfig):
    """Configuration class for Dbrx.

    [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        d_model (`int`, *optional*, defaults to 6144):
            Dimensionality of the embeddings and hidden states.
        n_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer encoder.
        max_seq_len (`int`, *optional*, defaults to 32768):
            The maximum sequence length of the model.
        vocab_size (`int`, *optional*, defaults to 100352):
            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`DbrxModel`].
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability applied to the attention output before combining with residual.
        emb_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the embedding layer.
        attn_config (`dict`, *optional*):
            A dictionary used to configure the model's attention module.
        ffn_config (`dict`, *optional*):
            A dictionary used to configure the model's FFN module.
        use_cache (`bool`, *optional*, defaults to `False`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.


    Example:
    ```python
    >>> from transformers import DbrxConfig, DbrxModel

    >>> # Initializing a Dbrx configuration
    >>> configuration = DbrxConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = DbrxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "dbrx"
    attribute_map = {
        "num_attention_heads": "n_heads",
        "hidden_size": "d_model",
        "num_hidden_layers": "n_layers",
        "max_position_embeddings": "max_seq_len",
    }

    def __init__(
        self,
        d_model: int = 2048,
        n_heads: int = 16,
        n_layers: int = 24,
        max_seq_len: int = 2048,
        vocab_size: int = 32000,
        resid_pdrop: float = 0.0,
        emb_pdrop: float = 0.0,
        attn_config: Optional[DbrxAttentionConfig] = None,
        ffn_config: Optional[DbrxFFNConfig] = None,
        use_cache: bool = True,
        initializer_range: float = 0.02,
        output_router_logits: bool = False,
        router_aux_loss_coef: float = 0.05,
        **kwargs: Any,
    ):
        if attn_config is None:
            self.attn_config = DbrxAttentionConfig()
        elif isinstance(attn_config, dict):
            self.attn_config = DbrxAttentionConfig(**attn_config)
        else:
            self.attn_config = attn_config

        if ffn_config is None:
            self.ffn_config = DbrxFFNConfig()
        elif isinstance(ffn_config, dict):
            self.ffn_config = DbrxFFNConfig(**ffn_config)
        else:
            self.ffn_config = ffn_config

        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef

        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
        if tie_word_embeddings:
            raise ValueError(
                "tie_word_embeddings is not supported for Dbrx models."
            )

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

attn_config instance-attribute

attn_config = DbrxAttentionConfig()

attribute_map class-attribute instance-attribute

attribute_map = {
    "num_attention_heads": "n_heads",
    "hidden_size": "d_model",
    "num_hidden_layers": "n_layers",
    "max_position_embeddings": "max_seq_len",
}

d_model instance-attribute

d_model = d_model

emb_pdrop instance-attribute

emb_pdrop = emb_pdrop

ffn_config instance-attribute

ffn_config = DbrxFFNConfig()

initializer_range instance-attribute

initializer_range = initializer_range

max_seq_len instance-attribute

max_seq_len = max_seq_len

model_type class-attribute instance-attribute

model_type = 'dbrx'

n_heads instance-attribute

n_heads = n_heads

n_layers instance-attribute

n_layers = n_layers

output_router_logits instance-attribute

output_router_logits = output_router_logits

resid_pdrop instance-attribute

resid_pdrop = resid_pdrop

router_aux_loss_coef instance-attribute

router_aux_loss_coef = router_aux_loss_coef

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    d_model: int = 2048,
    n_heads: int = 16,
    n_layers: int = 24,
    max_seq_len: int = 2048,
    vocab_size: int = 32000,
    resid_pdrop: float = 0.0,
    emb_pdrop: float = 0.0,
    attn_config: Optional[DbrxAttentionConfig] = None,
    ffn_config: Optional[DbrxFFNConfig] = None,
    use_cache: bool = True,
    initializer_range: float = 0.02,
    output_router_logits: bool = False,
    router_aux_loss_coef: float = 0.05,
    **kwargs: Any,
)
Source code in vllm/transformers_utils/configs/dbrx.py
def __init__(
    self,
    d_model: int = 2048,
    n_heads: int = 16,
    n_layers: int = 24,
    max_seq_len: int = 2048,
    vocab_size: int = 32000,
    resid_pdrop: float = 0.0,
    emb_pdrop: float = 0.0,
    attn_config: Optional[DbrxAttentionConfig] = None,
    ffn_config: Optional[DbrxFFNConfig] = None,
    use_cache: bool = True,
    initializer_range: float = 0.02,
    output_router_logits: bool = False,
    router_aux_loss_coef: float = 0.05,
    **kwargs: Any,
):
    if attn_config is None:
        self.attn_config = DbrxAttentionConfig()
    elif isinstance(attn_config, dict):
        self.attn_config = DbrxAttentionConfig(**attn_config)
    else:
        self.attn_config = attn_config

    if ffn_config is None:
        self.ffn_config = DbrxFFNConfig()
    elif isinstance(ffn_config, dict):
        self.ffn_config = DbrxFFNConfig(**ffn_config)
    else:
        self.ffn_config = ffn_config

    self.d_model = d_model
    self.n_heads = n_heads
    self.n_layers = n_layers
    self.max_seq_len = max_seq_len
    self.vocab_size = vocab_size
    self.resid_pdrop = resid_pdrop
    self.emb_pdrop = emb_pdrop
    self.use_cache = use_cache
    self.initializer_range = initializer_range
    self.output_router_logits = output_router_logits
    self.router_aux_loss_coef = router_aux_loss_coef

    tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
    if tie_word_embeddings:
        raise ValueError(
            "tie_word_embeddings is not supported for Dbrx models."
        )

    super().__init__(
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
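
A brief sketch of the constructor behavior shown above: omitted sub-configs fall back to their default config classes, and tie_word_embeddings is rejected with a ValueError.

from vllm.transformers_utils.configs import DbrxConfig

cfg = DbrxConfig()                     # attn_config / ffn_config get their defaults
print(type(cfg.attn_config).__name__)  # "DbrxAttentionConfig"

try:
    DbrxConfig(tie_word_embeddings=True)
except ValueError as err:
    print(err)                         # "tie_word_embeddings is not supported for Dbrx models."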

DeepseekVLV2Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/deepseek_vl2.py
class DeepseekVLV2Config(PretrainedConfig):
    model_type = "deepseek_vl_v2"
    vision_config: VisionEncoderConfig
    projector_config: MlpProjectorConfig

    tile_tag: str = "2D"
    global_view_pos: str = "head"
    candidate_resolutions: tuple[tuple[int, int]] = ((384, 384), )

    def __init__(self,
                 tile_tag: str = "tile_tag",
                 global_view_pos: str = "head",
                 candidate_resolutions: tuple[tuple[int,
                                                    int]] = ((384, 384), ),
                 **kwargs):
        super().__init__(**kwargs)

        vision_config = kwargs.get("vision_config", {})
        self.vision_config = VisionEncoderConfig(**vision_config)

        projector_config = kwargs.get("projector_config", {})
        self.projector_config = MlpProjectorConfig(**projector_config)

        language_config = kwargs.get("language_config", {})
        self.text_config = DeepseekV2Config(**language_config)

        self.tile_tag = tile_tag
        self.global_view_pos = global_view_pos
        self.candidate_resolutions = candidate_resolutions
        self.vocab_size = self.text_config.vocab_size

candidate_resolutions class-attribute instance-attribute

candidate_resolutions: tuple[tuple[int, int]] = (
    candidate_resolutions
)

global_view_pos class-attribute instance-attribute

global_view_pos: str = global_view_pos

model_type class-attribute instance-attribute

model_type = 'deepseek_vl_v2'

projector_config instance-attribute

projector_config: MlpProjectorConfig = MlpProjectorConfig(
    **projector_config
)

text_config instance-attribute

text_config = DeepseekV2Config(**language_config)

tile_tag class-attribute instance-attribute

tile_tag: str = tile_tag

vision_config instance-attribute

vision_config: VisionEncoderConfig = VisionEncoderConfig(
    **vision_config
)

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    tile_tag: str = "tile_tag",
    global_view_pos: str = "head",
    candidate_resolutions: tuple[tuple[int, int]] = (
        (384, 384),
    ),
    **kwargs,
)
Source code in vllm/transformers_utils/configs/deepseek_vl2.py
def __init__(self,
             tile_tag: str = "tile_tag",
             global_view_pos: str = "head",
             candidate_resolutions: tuple[tuple[int,
                                                int]] = ((384, 384), ),
             **kwargs):
    super().__init__(**kwargs)

    vision_config = kwargs.get("vision_config", {})
    self.vision_config = VisionEncoderConfig(**vision_config)

    projector_config = kwargs.get("projector_config", {})
    self.projector_config = MlpProjectorConfig(**projector_config)

    language_config = kwargs.get("language_config", {})
    self.text_config = DeepseekV2Config(**language_config)

    self.tile_tag = tile_tag
    self.global_view_pos = global_view_pos
    self.candidate_resolutions = candidate_resolutions
    self.vocab_size = self.text_config.vocab_size
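
A minimal sketch of how the constructor above pulls its sub-configs out of **kwargs; the empty dicts fall back to the defaults of VisionEncoderConfig, MlpProjectorConfig and DeepseekV2Config.

from vllm.transformers_utils.configs import DeepseekVLV2Config

cfg = DeepseekVLV2Config(
    vision_config={},     # becomes VisionEncoderConfig()
    projector_config={},  # becomes MlpProjectorConfig()
    language_config={},   # becomes DeepseekV2Config()
)
assert cfg.vocab_size == cfg.text_config.vocab_size  # copied from the language config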

EAGLEConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/eagle.py
class EAGLEConfig(PretrainedConfig):
    model_type = "eagle"

    def __init__(self,
                 model: Union[PretrainedConfig, dict, None] = None,
                 truncated_vocab_size: Optional[int] = None,
                 method: Optional[str] = 'eagle',
                 **kwargs):

        model_config: Union[PretrainedConfig, DeepseekV2Config, None]
        if isinstance(model, dict):
            archs = model.get("architectures", [])
            target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
            if any(target_arch in archs for target_arch in target_archs):
                # AutoConfig does not support DeepSeek MoE models yet
                model_config = DeepseekV2Config(**model)
            else:
                model_config = AutoConfig.for_model(**model)
        else:
            model_config = model

        for k, v in kwargs.items():
            if k != "architectures" and k != "model_type" and hasattr(
                    model_config, k):
                setattr(model_config, k, v)

        self.model = model_config

        if self.model is None:
            self.truncated_vocab_size = None
        else:
            self.truncated_vocab_size = self.model.vocab_size if \
                truncated_vocab_size is None else truncated_vocab_size

        if not envs.VLLM_USE_V1:
            kwargs["architectures"] = ["EAGLEModel"]
        else:
            # Eagle model name should follow naming convention of
            # LlamaForCausalLM -> EagleLlamaForCausalLM
            if method == "eagle":
                assert self.model is not None, \
                    "model should not be None when method is eagle"
                kwargs["architectures"] = [
                    f"Eagle{arch}" if not arch.startswith("Eagle") \
                        else arch for arch in self.model.architectures
                ]
            elif method == "eagle3":
                assert self.model is not None, \
                    "model should not be None when method is eagle3"
                kwargs["architectures"] = [
                    f"Eagle3{arch}" if not arch.startswith("Eagle3") \
                        else arch for arch in self.model.architectures
                ]
            else:
                raise ValueError(f"Invalid method {method}. \
                    Supported methods are eagle and eagle3.")

        super().__init__(**kwargs)

        if self.model is not None:
            for k, v in self.model.to_dict().items():
                if k not in kwargs:
                    setattr(self, k, v)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "EAGLEConfig":
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(config_dict, **kwargs)

model instance-attribute

model = model_config

model_type class-attribute instance-attribute

model_type = 'eagle'

truncated_vocab_size instance-attribute

truncated_vocab_size = None

__init__

__init__(
    model: Union[PretrainedConfig, dict, None] = None,
    truncated_vocab_size: Optional[int] = None,
    method: Optional[str] = "eagle",
    **kwargs,
)
Source code in vllm/transformers_utils/configs/eagle.py
def __init__(self,
             model: Union[PretrainedConfig, dict, None] = None,
             truncated_vocab_size: Optional[int] = None,
             method: Optional[str] = 'eagle',
             **kwargs):

    model_config: Union[PretrainedConfig, DeepseekV2Config, None]
    if isinstance(model, dict):
        archs = model.get("architectures", [])
        target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
        if any(target_arch in archs for target_arch in target_archs):
            # AutoConfig does not support DeepSeek MoE models yet
            model_config = DeepseekV2Config(**model)
        else:
            model_config = AutoConfig.for_model(**model)
    else:
        model_config = model

    for k, v in kwargs.items():
        if k != "architectures" and k != "model_type" and hasattr(
                model_config, k):
            setattr(model_config, k, v)

    self.model = model_config

    if self.model is None:
        self.truncated_vocab_size = None
    else:
        self.truncated_vocab_size = self.model.vocab_size if \
            truncated_vocab_size is None else truncated_vocab_size

    if not envs.VLLM_USE_V1:
        kwargs["architectures"] = ["EAGLEModel"]
    else:
        # Eagle model name should follow naming convention of
        # LlamaForCausalLM -> EagleLlamaForCausalLM
        if method == "eagle":
            assert self.model is not None, \
                "model should not be None when method is eagle"
            kwargs["architectures"] = [
                f"Eagle{arch}" if not arch.startswith("Eagle") \
                    else arch for arch in self.model.architectures
            ]
        elif method == "eagle3":
            assert self.model is not None, \
                "model should not be None when method is eagle3"
            kwargs["architectures"] = [
                f"Eagle3{arch}" if not arch.startswith("Eagle3") \
                    else arch for arch in self.model.architectures
            ]
        else:
            raise ValueError(f"Invalid method {method}. \
                Supported methods are eagle and eagle3.")

    super().__init__(**kwargs)

    if self.model is not None:
        for k, v in self.model.to_dict().items():
            if k not in kwargs:
                setattr(self, k, v)

from_pretrained classmethod

from_pretrained(
    pretrained_model_name_or_path: Union[str, PathLike],
    **kwargs,
) -> EAGLEConfig
Source code in vllm/transformers_utils/configs/eagle.py
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: Union[str, os.PathLike],
    **kwargs,
) -> "EAGLEConfig":
    config_dict, kwargs = cls.get_config_dict(
        pretrained_model_name_or_path, **kwargs)
    return cls.from_dict(config_dict, **kwargs)
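
An illustrative sketch of the architecture-naming convention enforced in __init__ above; the resulting architectures depend on VLLM_USE_V1 (with V1 enabled, the target architecture gains an Eagle prefix, otherwise it is replaced by EAGLEModel).

from vllm.transformers_utils.configs import EAGLEConfig

target = {"model_type": "llama", "architectures": ["LlamaForCausalLM"]}
cfg = EAGLEConfig(model=target, method="eagle")
print(cfg.architectures)         # ["EagleLlamaForCausalLM"] under V1, otherwise ["EAGLEModel"]
print(cfg.truncated_vocab_size)  # defaults to the target model's vocab_size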

ExaoneConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of an [ExaoneModel]. It is used to instantiate a GPT Lingvo model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a configuration similar to that of the Exaone model.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

Name Type Description Default
vocab_size `int`, *optional*, defaults to 102400

Vocabulary size of the GPT Lingvo model. Defines the number of different tokens that can be represented by the inputs_ids passed when calling [ExaoneModel].

102400
hidden_size `int`, *optional*, defaults to 2048

Dimensionality of the encoder layers and the pooler layer.

2048
num_layers `int`, *optional*, defaults to 32

Number of hidden layers in the Transformer encoder.

32
num_attention_heads `int`, *optional*, defaults to 32

Number of attention heads for each attention layer in the Transformer decoder.

32
num_key_value_heads `int`, *optional*

This is the number of key_value heads that should be used to implement Grouped Query Attention. If num_key_value_heads=num_attention_heads, the model will use Multi Head Attention (MHA); if num_key_value_heads=1, the model will use Multi Query Attention (MQA); otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by mean-pooling all the original heads within that group. For more details, check out this paper: https://arxiv.org/pdf/2305.13245.pdf. If it is not specified, it will default to num_attention_heads.

None
rotary_pct `float`, *optional*, defaults to 0.25

Percentage of hidden dimensions to allocate to rotary embeddings.

0.25
intermediate_size `int`, *optional*

Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.

None
activation_function `str` or `function`, *optional*, defaults to `"silu"`

The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "selu" and "gelu_new" are supported.

'silu'
embed_dropout `float`, *optional*, defaults to 0.0

The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.

0.0
attention_dropout `float`, *optional*, defaults to 0.0

The dropout ratio for the attention probabilities.

0.0
max_position_embeddings `int`, *optional*, defaults to 2048

The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048).

2048
type_vocab_size `int`, *optional*, defaults to 2

The vocabulary size of the token_type_ids passed when calling [EXAONEModel].

required
initializer_range `float`, *optional*, defaults to 0.02

The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

0.02
layer_norm_epsilon `float`, *optional*, defaults to 1e-6

The epsilon used by the layer normalization layers.

1e-06
use_cache `bool`, *optional*, defaults to `True`

Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if config.is_decoder=True.

True
gradient_checkpointing `bool`, *optional*, defaults to `False`

If True, use gradient checkpointing to save memory at the expense of a slower backward pass.

required

Example:

>>> from transformers import ExaoneModel, ExaoneConfig

>>> # Initializing an EXAONE configuration
>>> configuration = ExaoneConfig()

>>> # Initializing a model from the configuration
>>> model = ExaoneModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

Source code in vllm/transformers_utils/configs/exaone.py
class ExaoneConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:
    `~transformers.ExaoneModel`. It is used to instantiate a GPT Lingvo model
    according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar
    configuration to that of the Exaone

    Configuration objects inherit from {class}`~transformers.PretrainedConfig`
    and can be used to control the model outputs. Read the documentation from :
    class:`~transformers.PretrainedConfig` for more information.

    Args:
        vocab_size ({obj}`int`, `optional`, defaults to 50257):
            Vocabulary size of the GPT Lingvo model. Defines the number of
            different tokens that can be represented by the {obj}`inputs_ids`
            passed when calling {class}`~transformers.ExaoneModel`. Vocabulary
            size of the model.
            Defines the different tokens that can be represented by the
            `inputs_ids` passed to the forward method of :class:
            `~transformers.EXAONEModel`.
        hidden_size ({obj}`int`, `optional`, defaults to 2048):
            Dimensionality of the encoder layers and the pooler layer.
        num_layers ({obj}`int`, `optional`, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi
            Head Attention (MHA), if `num_key_value_heads=1` the model will use
            Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint,
            each group key and value head should be constructed by meanpooling
            all the original heads within that group. For more details checkout
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
            specified, will default to `num_attention_heads`.
        rotary_pct (`float`, *optional*, defaults to 0.25):
            percentage of hidden dimensions to allocate to rotary embeddings
        intermediate_size ({obj}`int`, `optional`, defaults to 8192):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in
            the Transformer encoder.
        activation_function ({obj}`str` or {obj}`function`, `optional`,
        defaults to {obj}`"gelu_new"`):
            The non-linear activation function (function or string) in the
            encoder and pooler. If string, {obj}`"gelu"`, {obj}`"relu"`,
            {obj}`"selu"` and {obj}`"gelu_new"` are supported.
        embed_dropout ({obj}`float`, `optional`, defaults to 0.0):
            The dropout probabilitiy for all fully connected layers in the
            embeddings, encoder, and pooler.
        attention_dropout ({obj}`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        max_position_embeddings ({obj}`int`, `optional`, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        type_vocab_size ({obj}`int`, `optional`, defaults to 2):
            The vocabulary size of the {obj}`token_type_ids` passed when calling
            {class}`~transformers.EXAONEModel`.
        initializer_range ({obj}`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_epsilon ({obj}`float`, `optional`, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        use_cache ({obj}`bool`, `optional`, defaults to {obj}`True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
            Only relevant if ``config.is_decoder=True``.
        gradient_checkpointing ({obj}`bool`, `optional`,
        defaults to {obj}`False`):
            If True, use gradient checkpointing to save memory at the expense
            of slower backward pass.
        Example::

            >>> from transformers import ExoneModel, ExaoneConfig

            >>> # Initializing a EXAONE configuration
            >>> configuration = ExaoneConfig()

            >>> # Initializing a model from configuration
            >>> model = ExoneModel(configuration)

            >>> # Accessing the model configuration
            >>> configuration = model.config
    """

    model_type = "exaone"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {"num_hidden_layers": "num_layers"}

    def __init__(
        self,
        vocab_size=102400,
        max_position_embeddings=2048,
        hidden_size=2048,
        num_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        intermediate_size=None,
        activation_function="silu",
        rotary_pct=0.25,
        resid_dropout=0.0,
        embed_dropout=0.0,
        attention_dropout=0.0,
        layer_norm_epsilon=1e-6,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=0,
        eos_token_id=2,
        tie_word_embeddings=True,
        **kwargs,
    ):
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_layers
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        if intermediate_size:
            self.intermediate_size = intermediate_size
        else:
            self.intermediate_size = hidden_size * 4
        self.activation_function = activation_function
        self.resid_dropout = resid_dropout
        self.embed_dropout = embed_dropout
        self.attention_dropout = attention_dropout
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.rotary_pct = rotary_pct

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        self.use_logit_cap = kwargs.pop("use_logit_cap", False)
        self.ln_no_scale = kwargs.pop("ln_no_scale", False)
        self.use_gated = kwargs.pop("use_gated", False)
        self.use_emb_norm = kwargs.pop("use_emb_norm", False)
        self.use_rotary_pos = kwargs.pop("use_rotary_pos", False)
        self.rotary_type = kwargs.pop("rotary_type", None)
        self.scaling_factor = kwargs.pop("scaling_factor", 1)
        self.use_absolute_pos = kwargs.pop("use_absolute_pos", True)
        self.use_extra_logit = kwargs.pop("use_extra_logit", True)
        self.rotary_expand_length = kwargs.pop("rotary_expand_length", None)
        self.rotary_base = kwargs.pop("rotary_base", 10000.0)
        self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False)
        self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head",
                                                 (rotary_pct == 0.25))
        if self.use_rotary_pos:
            self.use_absolute_pos = False

activation_function instance-attribute

activation_function = activation_function

attention_dropout instance-attribute

attention_dropout = attention_dropout

attribute_map class-attribute instance-attribute

attribute_map = {'num_hidden_layers': 'num_layers'}

bos_token_id instance-attribute

bos_token_id = bos_token_id

embed_dropout instance-attribute

embed_dropout = embed_dropout

eos_token_id instance-attribute

eos_token_id = eos_token_id

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_epsilon instance-attribute

layer_norm_epsilon = layer_norm_epsilon

ln_no_scale instance-attribute

ln_no_scale = pop('ln_no_scale', False)

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

model_type class-attribute instance-attribute

model_type = 'exaone'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

num_layers instance-attribute

num_layers = num_layers

rescale_before_lm_head instance-attribute

rescale_before_lm_head = pop(
    "rescale_before_lm_head", rotary_pct == 0.25
)

resid_dropout instance-attribute

resid_dropout = resid_dropout

rotary_base instance-attribute

rotary_base = pop('rotary_base', 10000.0)

rotary_expand_length instance-attribute

rotary_expand_length = pop('rotary_expand_length', None)

rotary_pct instance-attribute

rotary_pct = rotary_pct

rotary_type instance-attribute

rotary_type = pop('rotary_type', None)

scaling_factor instance-attribute

scaling_factor = pop('scaling_factor', 1)

use_absolute_pos instance-attribute

use_absolute_pos = pop('use_absolute_pos', True)

use_cache instance-attribute

use_cache = use_cache

use_emb_norm instance-attribute

use_emb_norm = pop('use_emb_norm', False)

use_extra_logit instance-attribute

use_extra_logit = pop('use_extra_logit', True)

use_gated instance-attribute

use_gated = pop('use_gated', False)

use_logit_cap instance-attribute

use_logit_cap = pop('use_logit_cap', False)

use_qkv_fuse instance-attribute

use_qkv_fuse = pop('use_qkv_fuse', False)

use_rotary_pos instance-attribute

use_rotary_pos = pop('use_rotary_pos', False)

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=102400,
    max_position_embeddings=2048,
    hidden_size=2048,
    num_layers=32,
    num_attention_heads=32,
    num_key_value_heads=None,
    intermediate_size=None,
    activation_function="silu",
    rotary_pct=0.25,
    resid_dropout=0.0,
    embed_dropout=0.0,
    attention_dropout=0.0,
    layer_norm_epsilon=1e-06,
    initializer_range=0.02,
    use_cache=True,
    bos_token_id=0,
    eos_token_id=2,
    tie_word_embeddings=True,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/exaone.py
def __init__(
    self,
    vocab_size=102400,
    max_position_embeddings=2048,
    hidden_size=2048,
    num_layers=32,
    num_attention_heads=32,
    num_key_value_heads=None,
    intermediate_size=None,
    activation_function="silu",
    rotary_pct=0.25,
    resid_dropout=0.0,
    embed_dropout=0.0,
    attention_dropout=0.0,
    layer_norm_epsilon=1e-6,
    initializer_range=0.02,
    use_cache=True,
    bos_token_id=0,
    eos_token_id=2,
    tie_word_embeddings=True,
    **kwargs,
):
    super().__init__(
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )

    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.num_attention_heads = num_attention_heads
    self.num_hidden_layers = num_layers
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads
    self.num_key_value_heads = num_key_value_heads
    if intermediate_size:
        self.intermediate_size = intermediate_size
    else:
        self.intermediate_size = hidden_size * 4
    self.activation_function = activation_function
    self.resid_dropout = resid_dropout
    self.embed_dropout = embed_dropout
    self.attention_dropout = attention_dropout
    self.layer_norm_epsilon = layer_norm_epsilon
    self.initializer_range = initializer_range
    self.use_cache = use_cache
    self.rotary_pct = rotary_pct

    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id

    self.use_logit_cap = kwargs.pop("use_logit_cap", False)
    self.ln_no_scale = kwargs.pop("ln_no_scale", False)
    self.use_gated = kwargs.pop("use_gated", False)
    self.use_emb_norm = kwargs.pop("use_emb_norm", False)
    self.use_rotary_pos = kwargs.pop("use_rotary_pos", False)
    self.rotary_type = kwargs.pop("rotary_type", None)
    self.scaling_factor = kwargs.pop("scaling_factor", 1)
    self.use_absolute_pos = kwargs.pop("use_absolute_pos", True)
    self.use_extra_logit = kwargs.pop("use_extra_logit", True)
    self.rotary_expand_length = kwargs.pop("rotary_expand_length", None)
    self.rotary_base = kwargs.pop("rotary_base", 10000.0)
    self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False)
    self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head",
                                             (rotary_pct == 0.25))
    if self.use_rotary_pos:
        self.use_absolute_pos = False
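
The extra fields at the end of `__init__` (for example `use_rotary_pos`, `rotary_type`, `rotary_base`) are consumed from `**kwargs` via `kwargs.pop`. A minimal sketch of how this plays out with the defaults shown above:

>>> config = ExaoneConfig(num_layers=4, use_rotary_pos=True, rotary_base=500000.0)
>>> config.rotary_base
500000.0
>>> # Enabling rotary positions disables absolute positions automatically
>>> config.use_absolute_pos
False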

JAISConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [JAISModel]. It is used to instantiate a JAIS model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

Name Type Description Default
vocab_size `int`, *optional*, defaults to 50257

Vocabulary size of the JAIS model. Defines the number of different tokens that can be represented by the inputs_ids passed when calling [JAISModel].

50257
n_positions `int`, *optional*, defaults to 1024

The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048).

1024
n_embd `int`, *optional*, defaults to 768

Dimensionality of the embeddings and hidden states.

768
n_layer `int`, *optional*, defaults to 12

Number of hidden layers in the Transformer encoder.

12
n_head `int`, *optional*, defaults to 12

Number of attention heads for each attention layer in the Transformer encoder.

12
n_inner `int`, *optional*, defaults to None

Dimensionality of the inner feed-forward layers. None will set it to 4 times n_embd.

None
activation_function `str`, *optional*, defaults to `"gelu"`

Activation function, to be selected in the list ["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"].

'gelu_new'
resid_pdrop `float`, *optional*, defaults to 0.1

The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.

0.1
embd_pdrop `float`, *optional*, defaults to 0.1

The dropout ratio for the embeddings.

0.1
attn_pdrop `float`, *optional*, defaults to 0.1

The dropout ratio for the attention.

0.1
layer_norm_epsilon `float`, *optional*, defaults to 1e-5

The epsilon to use in the layer normalization layers.

1e-05
initializer_range `float`, *optional*, defaults to 0.02

The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

0.02
scale_attn_weights `bool`, *optional*, defaults to `True`

Scale attention weights by dividing by sqrt(hidden_size).

True
use_cache `bool`, *optional*, defaults to `True`

Whether or not the model should return the last key/values attentions (not used by all models).

True
reorder_and_upcast_attn `bool`, *optional*, defaults to `False`

Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention dot-product/softmax to float() when training with mixed precision.

False
position_embedding_type `str`, *optional*, defaults to `"learned"`

Positional embedding can be either "alibi" or "learned".

'learned'
mup_width_scale `float`, *optional*, defaults to 1.0

muP parameter to scale learning rate and initializers. Calculated as (d_model,0 / d_model), where d_model is the model's width and d_model,0 is the proxy model's width.

1.0
mup_embeddings_scale `float`, *optional*, defaults to 1.0

muP parameter to scale token and position embeddings.

1.0
mup_output_alpha `float`, *optional*, defaults to 1.0

muP parameter to scale output logits (output_logits_scale = mup_output_alpha * mup_width_scale).

1.0
mup_scale_qk_dot_by_d `bool`, *optional*, defaults to `False`

Scale attention weights by dividing by hidden_size instead of sqrt(hidden_size). Need to set scale_attn_weights to True as well.

False
alibi_scaling `dict`, *optional*

Dictionary containing the scaling configuration for ALiBi embeddings. Currently only supports linear scaling strategy. Can specify either the scaling factor (must be a float greater than 1) for fixed scaling or train_seq_len for dynamic scaling on input samples with sequence length > train_seq_len. The expected formats are {"type": strategy name, "factor": scaling factor} or {"type": strategy name, "train_seq_len": training sequence length}.

None
architectures `list`, *optional*, defaults to ['JAISLMHeadModel']

Architecture names for Jais.

None

Example:

>>> from transformers import JAISConfig, JAISModel

>>> # Initializing a JAIS configuration
>>> configuration = JAISConfig()

>>> # Initializing a model (with random weights) from the configuration
>>> model = JAISModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
Source code in vllm/transformers_utils/configs/jais.py
class JAISConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a
    [`JAISModel`]. It is used to instantiate a JAIS model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used
    to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the JAIS model. Defines the number of different
            tokens that can be represented by the
            `inputs_ids` passed when calling [`JAISModel`].
        n_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used
            with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        n_inner (`int`, *optional*, defaults to None):
            Dimensionality of the inner feed-forward layers. `None` will set
            it to 4 times n_embd
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Activation function, to be selected in the list
            `["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in
            the embeddings, encoder, and pooler.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by sqrt(hidden_size)..
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
        scale_attn_by_inverse_layer_idx (`bool`, *optional*,
            defaults to `False`):
            Whether to additionally scale attention weights by
            `1 / layer_idx + 1`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention
            (dot-product)
            and upcast attention dot-product/softmax to float() when training
            with mixed precision.
        position_embedding_type (`str`, *optional*, defaults to `"learned"`):
            Positional embedding can be either `"alibi"` or `"learned"`.
        mup_width_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale learning rate and initializers. Calculated
            as (`d_model,0 / d_model`), where
            `d_model` is the model's width and `d_model,0` is the proxy
            model's width.
        mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale token and position embeddings.
        mup_output_alpha (`float`, *optional*, defaults to 1.0):
            muP parameter to scale output logits
            (`output_logits_scale = mup_output_alpha * mup_width_scale`).
        mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
            Scale attention weights by dividing by hidden_size instead of
            sqrt(hidden_size). Need to set scale_attn_weights to `True` as
            well.
        alibi_scaling (`dict`, *optional*):
            Dictionary containing the scaling configuration for ALiBi
            embeddings. Currently only supports linear
            scaling strategy. Can specify either the scaling `factor` (must be
            a float greater than 1) for fixed scaling
            or `train_seq_len` for dynamic scaling on input samples with
            sequence length > `train_seq_len`. The expected
            formats are `{"type": strategy name, "factor": scaling factor}` or
            `{"type": strategy name,
            "train_seq_len": training sequence length}`.
        architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']):
            architecture names for Jais.

    Example:

    ```python
    >>> from transformers import JAISConfig, JAISModel

    >>> # Initializing a JAIS configuration
    >>> configuration = JAISConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = JAISModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "jais"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "hidden_size": "n_embd",
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size=50257,
        n_positions=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
        n_inner=None,
        activation_function="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        scale_attn_weights=True,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        scale_attn_by_inverse_layer_idx=False,
        reorder_and_upcast_attn=False,
        position_embedding_type="learned",
        mup_width_scale=1.0,
        mup_embeddings_scale=1.0,
        mup_output_alpha=1.0,
        mup_scale_qk_dot_by_d=False,
        alibi_scaling=None,
        architectures=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.reorder_and_upcast_attn = reorder_and_upcast_attn

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        self.position_embedding_type = position_embedding_type
        self.mup_width_scale = mup_width_scale
        self.mup_embeddings_scale = mup_embeddings_scale
        self.mup_output_alpha = mup_output_alpha
        self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d

        self.alibi_scaling = alibi_scaling
        self._alibi_scaling_validation()
        if architectures is None:
            architectures = ["JAISLMHeadModel"]

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            architectures=architectures,
            **kwargs,
        )

    def _alibi_scaling_validation(self):
        """
        Validate the `alibi_scaling` configuration.
        """
        if self.alibi_scaling is None:
            return

        if (not isinstance(self.alibi_scaling, dict)
                or len(self.alibi_scaling) != 2):
            raise ValueError(
                "`alibi_scaling` must be a dictionary with two fields, "
                "`type` and `factor` or `type` and `train_seq_len`, "
                f"got {self.alibi_scaling}")
        alibi_scaling_type = self.alibi_scaling.get("type", None)
        alibi_scaling_factor = self.alibi_scaling.get("factor", None)
        alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
        if alibi_scaling_type is None or alibi_scaling_type != "linear":
            raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
                             f"got {alibi_scaling_type}")
        if (alibi_scaling_factor is not None
                and not isinstance(alibi_scaling_factor, float)
                or (alibi_scaling_factor is not None
                    and alibi_scaling_factor <= 1.0)):
            raise ValueError(
                f"`alibi_scaling`'s factor field must be a float > 1.0, "
                f"got {alibi_scaling_factor}")
        if (alibi_dynamic_scaling is not None
                and not isinstance(alibi_dynamic_scaling, int)
                or (alibi_dynamic_scaling is not None
                    and alibi_dynamic_scaling <= 1)):
            raise ValueError(
                f"`alibi_scaling`'s `train_seq_len` field must be an "
                f"integer > 1, got {alibi_dynamic_scaling}")

activation_function instance-attribute

activation_function = activation_function

alibi_scaling instance-attribute

alibi_scaling = alibi_scaling

attn_pdrop instance-attribute

attn_pdrop = attn_pdrop

attribute_map class-attribute instance-attribute

attribute_map = {
    "hidden_size": "n_embd",
    "max_position_embeddings": "n_positions",
    "num_attention_heads": "n_head",
    "num_hidden_layers": "n_layer",
}

bos_token_id instance-attribute

bos_token_id = bos_token_id

embd_pdrop instance-attribute

embd_pdrop = embd_pdrop

eos_token_id instance-attribute

eos_token_id = eos_token_id

initializer_range instance-attribute

initializer_range = initializer_range

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_epsilon instance-attribute

layer_norm_epsilon = layer_norm_epsilon

model_type class-attribute instance-attribute

model_type = 'jais'

mup_embeddings_scale instance-attribute

mup_embeddings_scale = mup_embeddings_scale

mup_output_alpha instance-attribute

mup_output_alpha = mup_output_alpha

mup_scale_qk_dot_by_d instance-attribute

mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d

mup_width_scale instance-attribute

mup_width_scale = mup_width_scale

n_embd instance-attribute

n_embd = n_embd

n_head instance-attribute

n_head = n_head

n_inner instance-attribute

n_inner = n_inner

n_layer instance-attribute

n_layer = n_layer

n_positions instance-attribute

n_positions = n_positions

position_embedding_type instance-attribute

position_embedding_type = position_embedding_type

reorder_and_upcast_attn instance-attribute

reorder_and_upcast_attn = reorder_and_upcast_attn

resid_pdrop instance-attribute

resid_pdrop = resid_pdrop

scale_attn_by_inverse_layer_idx instance-attribute

scale_attn_by_inverse_layer_idx = (
    scale_attn_by_inverse_layer_idx
)

scale_attn_weights instance-attribute

scale_attn_weights = scale_attn_weights

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=50257,
    n_positions=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    n_inner=None,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-05,
    initializer_range=0.02,
    scale_attn_weights=True,
    use_cache=True,
    bos_token_id=50256,
    eos_token_id=50256,
    scale_attn_by_inverse_layer_idx=False,
    reorder_and_upcast_attn=False,
    position_embedding_type="learned",
    mup_width_scale=1.0,
    mup_embeddings_scale=1.0,
    mup_output_alpha=1.0,
    mup_scale_qk_dot_by_d=False,
    alibi_scaling=None,
    architectures=None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/jais.py
def __init__(
    self,
    vocab_size=50257,
    n_positions=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    n_inner=None,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    scale_attn_weights=True,
    use_cache=True,
    bos_token_id=50256,
    eos_token_id=50256,
    scale_attn_by_inverse_layer_idx=False,
    reorder_and_upcast_attn=False,
    position_embedding_type="learned",
    mup_width_scale=1.0,
    mup_embeddings_scale=1.0,
    mup_output_alpha=1.0,
    mup_scale_qk_dot_by_d=False,
    alibi_scaling=None,
    architectures=None,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.n_positions = n_positions
    self.n_embd = n_embd
    self.n_layer = n_layer
    self.n_head = n_head
    self.n_inner = n_inner
    self.activation_function = activation_function
    self.resid_pdrop = resid_pdrop
    self.embd_pdrop = embd_pdrop
    self.attn_pdrop = attn_pdrop
    self.layer_norm_epsilon = layer_norm_epsilon
    self.initializer_range = initializer_range
    self.scale_attn_weights = scale_attn_weights
    self.use_cache = use_cache
    self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
    self.reorder_and_upcast_attn = reorder_and_upcast_attn

    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id

    self.position_embedding_type = position_embedding_type
    self.mup_width_scale = mup_width_scale
    self.mup_embeddings_scale = mup_embeddings_scale
    self.mup_output_alpha = mup_output_alpha
    self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d

    self.alibi_scaling = alibi_scaling
    self._alibi_scaling_validation()
    if architectures is None:
        architectures = ["JAISLMHeadModel"]

    super().__init__(
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        architectures=architectures,
        **kwargs,
    )

_alibi_scaling_validation

_alibi_scaling_validation()

Validate the alibi_scaling configuration.

Source code in vllm/transformers_utils/configs/jais.py
def _alibi_scaling_validation(self):
    """
    Validate the `alibi_scaling` configuration.
    """
    if self.alibi_scaling is None:
        return

    if (not isinstance(self.alibi_scaling, dict)
            or len(self.alibi_scaling) != 2):
        raise ValueError(
            "`alibi_scaling` must be a dictionary with two fields, "
            "`type` and `factor` or `type` and `train_seq_len`, "
            f"got {self.alibi_scaling}")
    alibi_scaling_type = self.alibi_scaling.get("type", None)
    alibi_scaling_factor = self.alibi_scaling.get("factor", None)
    alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
    if alibi_scaling_type is None or alibi_scaling_type != "linear":
        raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
                         f"got {alibi_scaling_type}")
    if (alibi_scaling_factor is not None
            and not isinstance(alibi_scaling_factor, float)
            or (alibi_scaling_factor is not None
                and alibi_scaling_factor <= 1.0)):
        raise ValueError(
            f"`alibi_scaling`'s factor field must be a float > 1.0, "
            f"got {alibi_scaling_factor}")
    if (alibi_dynamic_scaling is not None
            and not isinstance(alibi_dynamic_scaling, int)
            or (alibi_dynamic_scaling is not None
                and alibi_dynamic_scaling <= 1)):
        raise ValueError(
            f"`alibi_scaling`'s `train_seq_len` field must be an "
            f"integer > 1, got {alibi_dynamic_scaling}")
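
A short sketch of the `alibi_scaling` checks performed above: the dict must contain exactly two fields, `type` must be `"linear"`, and `factor` (if given) must be a float greater than 1.0:

>>> cfg = JAISConfig(alibi_scaling={"type": "linear", "factor": 2.0})           # accepted
>>> cfg = JAISConfig(alibi_scaling={"type": "linear", "train_seq_len": 2048})   # accepted
>>> cfg = JAISConfig(alibi_scaling={"type": "linear", "factor": 0.5})           # raises ValueError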

KimiVLConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/kimi_vl.py
class KimiVLConfig(PretrainedConfig):
    model_type = "kimi_vl"

    def __init__(self,
                 vision_config: Optional[Union[dict, MoonViTConfig]] = None,
                 text_config: Optional[Union[dict, DeepseekV2Config]] = None,
                 ignore_index: int = -100,
                 media_placeholder_token_id: int = 163605,
                 pad_token_id: int = 0,
                 **kwargs):
        if vision_config is None:
            vision_config = MoonViTConfig()
        elif isinstance(vision_config, dict):
            vision_config = MoonViTConfig(**vision_config)
        self.vision_config = vision_config

        if text_config is None:
            text_config = DeepseekV2Config()
        elif isinstance(text_config, dict):
            text_config = DeepseekV2Config(**text_config)
        self.text_config = text_config

        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id

        super().__init__(pad_token_id=pad_token_id, **kwargs)

ignore_index instance-attribute

ignore_index = ignore_index

media_placeholder_token_id instance-attribute

media_placeholder_token_id = media_placeholder_token_id

model_type class-attribute instance-attribute

model_type = 'kimi_vl'

text_config instance-attribute

text_config = text_config

vision_config instance-attribute

vision_config = vision_config

__init__

__init__(
    vision_config: Optional[
        Union[dict, MoonViTConfig]
    ] = None,
    text_config: Optional[
        Union[dict, DeepseekV2Config]
    ] = None,
    ignore_index: int = -100,
    media_placeholder_token_id: int = 163605,
    pad_token_id: int = 0,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/kimi_vl.py
def __init__(self,
             vision_config: Optional[Union[dict, MoonViTConfig]] = None,
             text_config: Optional[Union[dict, DeepseekV2Config]] = None,
             ignore_index: int = -100,
             media_placeholder_token_id: int = 163605,
             pad_token_id: int = 0,
             **kwargs):
    if vision_config is None:
        vision_config = MoonViTConfig()
    elif isinstance(vision_config, dict):
        vision_config = MoonViTConfig(**vision_config)
    self.vision_config = vision_config

    if text_config is None:
        text_config = DeepseekV2Config()
    elif isinstance(text_config, dict):
        text_config = DeepseekV2Config(**text_config)
    self.text_config = text_config

    self.ignore_index = ignore_index
    self.media_placeholder_token_id = media_placeholder_token_id

    super().__init__(pad_token_id=pad_token_id, **kwargs)
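
A minimal sketch of the defaulting behavior above: omitted sub-configs fall back to `MoonViTConfig()` and `DeepseekV2Config()`, and plain dicts are promoted to those classes:

>>> config = KimiVLConfig()
>>> type(config.vision_config).__name__, type(config.text_config).__name__
('MoonViTConfig', 'DeepseekV2Config')
>>> config = KimiVLConfig(vision_config={})  # dicts are converted via MoonViTConfig(**vision_config)
>>> type(config.vision_config).__name__
'MoonViTConfig'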

MLPSpeculatorConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/mlp_speculator.py
class MLPSpeculatorConfig(PretrainedConfig):
    model_type = "mlp_speculator"

    attribute_map = {
        "hidden_size": "emb_dim",
    }

    def __init__(self,
                 vocab_size: int = 32000,
                 emb_dim: int = 4096,
                 inner_dim: int = 0,
                 n_predict: int = 3,
                 top_k_tokens_per_head: Optional[list[int]] = None,
                 n_candidates: int = 5,
                 tie_weights: bool = False,
                 scale_input: bool = False,
                 **kwargs):
        """
        Initialize an MLPSpeculatorConfig

        Args:
            vocab_size: int
                the model vocab size
            emb_dim: int
                the model embedding dimension
            inner_dim: int
                the inner dimension of the model. If 0, will be the emb_dim.
            n_predict: int
                the number of lookaheads for the speculator
            top_k_tokens_per_head: list[int]
                Number of tokens to consider from each head when forming the
                candidate tree.
                For each candidate branch in the tree, head n produces topk[n]
                additional sub-branches.
                NOTE: This parameter is currently unused.
            n_candidates: int
                number of child candidates to create per sequence
            tie_weights: bool
                If true, use a single set of weights for every model
                head/stage after the first. The initial projection
                from the base model may have a different size, so that
                stays separate.
            scale_input: bool
                if True, will scale the initial hidden states from
                the base model.
        """
        if top_k_tokens_per_head is None:
            top_k_tokens_per_head = [5, 4, 3]
        assert len(top_k_tokens_per_head) == n_predict
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.inner_dim = inner_dim
        self.n_predict = n_predict
        self.top_k_tokens_per_head = top_k_tokens_per_head
        self.n_candidates = n_candidates
        self.num_lookahead_tokens = n_predict
        self.tie_weights = tie_weights
        self.scale_input = scale_input

        super().__init__(**kwargs)

attribute_map class-attribute instance-attribute

attribute_map = {'hidden_size': 'emb_dim'}

emb_dim instance-attribute

emb_dim = emb_dim

inner_dim instance-attribute

inner_dim = inner_dim

model_type class-attribute instance-attribute

model_type = 'mlp_speculator'

n_candidates instance-attribute

n_candidates = n_candidates

n_predict instance-attribute

n_predict = n_predict

num_lookahead_tokens instance-attribute

num_lookahead_tokens = n_predict

scale_input instance-attribute

scale_input = scale_input

tie_weights instance-attribute

tie_weights = tie_weights

top_k_tokens_per_head instance-attribute

top_k_tokens_per_head = top_k_tokens_per_head

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size: int = 32000,
    emb_dim: int = 4096,
    inner_dim: int = 0,
    n_predict: int = 3,
    top_k_tokens_per_head: Optional[list[int]] = None,
    n_candidates: int = 5,
    tie_weights: bool = False,
    scale_input: bool = False,
    **kwargs,
)

Initialize an MLPSpeculatorConfig

Parameters:

Name Type Description Default
vocab_size int

The model vocab size.

32000
emb_dim int

The model embedding dimension.

4096
inner_dim int

The inner dimension of the model. If 0, will be the emb_dim.

0
n_predict int

The number of lookaheads for the speculator.

3
top_k_tokens_per_head Optional[list[int]]

Number of tokens to consider from each head when forming the candidate tree. For each candidate branch in the tree, head n produces topk[n] additional sub-branches. NOTE: This parameter is currently unused.

None
n_candidates int

Number of child candidates to create per sequence.

5
tie_weights bool

If true, use a single set of weights for every model head/stage after the first. The initial projection from the base model may have a different size, so that stays separate.

False
scale_input bool

If True, will scale the initial hidden states from the base model.

False
Source code in vllm/transformers_utils/configs/mlp_speculator.py
def __init__(self,
             vocab_size: int = 32000,
             emb_dim: int = 4096,
             inner_dim: int = 0,
             n_predict: int = 3,
             top_k_tokens_per_head: Optional[list[int]] = None,
             n_candidates: int = 5,
             tie_weights: bool = False,
             scale_input: bool = False,
             **kwargs):
    """
    Initialize an MLPSpeculatorConfig

    Args:
        vocab_size: int
            the model vocab size
        emb_dim: int
            the model embedding dimension
        inner_dim: int
            the inner dimension of the model. If 0, will be the emb_dim.
        n_predict: int
            the number of lookaheads for the speculator
        top_k_tokens_per_head: list[int]
            Number of tokens to consider from each head when forming the
            candidate tree.
            For each candidate branch in the tree, head n produces topk[n]
            additional sub-branches.
            NOTE: This parameter is currently unused.
        n_candidates: int
            number of child candidates to create per sequence
        tie_weights: bool
            If true, use a single set of weights for every model
            head/stage after the first. The initial projection
            from the base model may have a different size, so that
            stays separate.
        scale_input: bool
            if True, will scale the initial hidden states from
            the base model.
    """
    if top_k_tokens_per_head is None:
        top_k_tokens_per_head = [5, 4, 3]
    assert len(top_k_tokens_per_head) == n_predict
    self.vocab_size = vocab_size
    self.emb_dim = emb_dim
    self.inner_dim = inner_dim
    self.n_predict = n_predict
    self.top_k_tokens_per_head = top_k_tokens_per_head
    self.n_candidates = n_candidates
    self.num_lookahead_tokens = n_predict
    self.tie_weights = tie_weights
    self.scale_input = scale_input

    super().__init__(**kwargs)
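
A small sketch of the constraint asserted above: the length of `top_k_tokens_per_head` (default `[5, 4, 3]`) must equal `n_predict`, which is also exposed as `num_lookahead_tokens`:

>>> cfg = MLPSpeculatorConfig()
>>> cfg.top_k_tokens_per_head, cfg.num_lookahead_tokens
([5, 4, 3], 3)
>>> # Overriding n_predict requires a list of matching length
>>> cfg = MLPSpeculatorConfig(n_predict=4, top_k_tokens_per_head=[5, 4, 3, 2])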

MPTConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/mpt.py
class MPTConfig(PretrainedConfig):
    model_type = 'mpt'
    attribute_map = {
        'num_attention_heads': 'n_heads',
        'hidden_size': 'd_model',
        'num_hidden_layers': 'n_layers',
    }

    # pylint: disable=dangerous-default-value
    def __init__(self,
                 d_model: int = 2048,
                 n_heads: int = 16,
                 n_layers: int = 24,
                 expansion_ratio: int = 4,
                 max_seq_len: int = 2048,
                 vocab_size: int = 50368,
                 resid_pdrop: float = 0.0,
                 emb_pdrop: float = 0.0,
                 learned_pos_emb: bool = True,
                 attn_config: dict = attn_config_defaults,
                 ffn_config: dict = ffn_config_defaults,
                 init_device: str = 'cpu',
                 logit_scale: Optional[Union[float, str]] = None,
                 no_bias: bool = False,
                 embedding_fraction: float = 1.0,
                 norm_type: str = 'low_precision_layernorm',
                 use_cache: bool = False,
                 init_config: dict = init_config_defaults,
                 fc_type: str = 'torch',
                 verbose: Optional[int] = None,
                 **kwargs: Any):
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.expansion_ratio = expansion_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        self.attn_config = attn_config
        self.ffn_config = ffn_config
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.use_cache = use_cache
        self.init_config = init_config
        self.fc_type = fc_type
        if verbose is not None:
            warnings.warn(DeprecationWarning(
                'verbose argument for MPTConfig is now ignored and '
                'will be removed. Use python_log_level instead.'),
                          stacklevel=2)
        if 'name' in kwargs:
            del kwargs['name']
        if 'loss_fn' in kwargs:
            del kwargs['loss_fn']
        if self.attn_config.get('alibi', False):
            self.learned_pos_emb = False
            warnings.warn(
                f'alibi is turned on, setting `learned_pos_emb` '
                f'to {self.learned_pos_emb}`',
                stacklevel=2)
        super().__init__(**kwargs)
        self._validate_config()

    def _set_config_defaults(
            self, config: dict[str, Any],
            config_defaults: dict[str, Any]) -> dict[str, Any]:
        for (k, v) in config_defaults.items():
            if k not in config:
                config[k] = v
        return config

    def _validate_config(self) -> None:
        self.attn_config = self._set_config_defaults(self.attn_config,
                                                     attn_config_defaults)
        self.ffn_config = self._set_config_defaults(self.ffn_config,
                                                    ffn_config_defaults)
        self.init_config = self._set_config_defaults(self.init_config,
                                                     init_config_defaults)
        if self.d_model % self.n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')
        if any(
                prob < 0 or prob > 1 for prob in
            [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop
             ]):
            raise ValueError(
                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
                "probabilities and must be between 0 and 1")
        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
            raise ValueError(
                f"Unknown attn_impl={self.attn_config['attn_impl']}")
        if self.attn_config['prefix_lm'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
                'prefix_lm only implemented with torch and triton attention.')
        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
                'torch', 'triton'
        ]:
            raise NotImplementedError(
                'alibi only implemented with torch and triton attention.')
        if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
                'attn_uses_sequence_id only implemented with torch '
                'and triton attention.')
        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError(
                'model.embedding_fraction must be between 0 (exclusive) '
                'and 1 (inclusive)!')
        if isinstance(self.logit_scale,
                      str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(
                f"self.logit_scale={self.logit_scale!r} is not recognized as "
                "an option; use numeric value or 'inv_sqrt_d_model'.")
        if self.init_config.get('name', None) is None:
            raise ValueError(
                f"self.init_config={self.init_config!r} 'name' needs to be set."
            )
        if not self.learned_pos_emb and (not self.attn_config['alibi']):
            warnings.warn(
                'Positional information not being provided to the model.',
                stacklevel=2)
        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
            try:
                # pylint: disable=import-outside-toplevel
                import transformer_engine.pytorch as te
                del te
            except Exception as exc:
                raise ImportError(
                    'TransformerEngine import fail. `fc_type: te` requires '
                    'TransformerEngine be installed. '
                    'The required version of transformer_engine also requires '
                    'FlashAttention v1.0.6 is installed:\n'
                    'pip install flash-attn==1.0.6 --no-build-isolation \n'
                    'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
                ) from exc
        if self.ffn_config['ffn_type'] == 'mptmlp':
            self.ffn_config['fc_type'] = self.fc_type
        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
            self.ffn_config['bias'] = not self.no_bias

attn_config instance-attribute

attn_config = attn_config

attribute_map class-attribute instance-attribute

attribute_map = {
    "num_attention_heads": "n_heads",
    "hidden_size": "d_model",
    "num_hidden_layers": "n_layers",
}

d_model instance-attribute

d_model = d_model

emb_pdrop instance-attribute

emb_pdrop = emb_pdrop

embedding_fraction instance-attribute

embedding_fraction = embedding_fraction

expansion_ratio instance-attribute

expansion_ratio = expansion_ratio

fc_type instance-attribute

fc_type = fc_type

ffn_config instance-attribute

ffn_config = ffn_config

init_config instance-attribute

init_config = init_config

init_device instance-attribute

init_device = init_device

learned_pos_emb instance-attribute

learned_pos_emb = learned_pos_emb

logit_scale instance-attribute

logit_scale = logit_scale

max_seq_len instance-attribute

max_seq_len = max_seq_len

model_type class-attribute instance-attribute

model_type = 'mpt'

n_heads instance-attribute

n_heads = n_heads

n_layers instance-attribute

n_layers = n_layers

no_bias instance-attribute

no_bias = no_bias

norm_type instance-attribute

norm_type = norm_type

resid_pdrop instance-attribute

resid_pdrop = resid_pdrop

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    d_model: int = 2048,
    n_heads: int = 16,
    n_layers: int = 24,
    expansion_ratio: int = 4,
    max_seq_len: int = 2048,
    vocab_size: int = 50368,
    resid_pdrop: float = 0.0,
    emb_pdrop: float = 0.0,
    learned_pos_emb: bool = True,
    attn_config: dict = attn_config_defaults,
    ffn_config: dict = ffn_config_defaults,
    init_device: str = "cpu",
    logit_scale: Optional[Union[float, str]] = None,
    no_bias: bool = False,
    embedding_fraction: float = 1.0,
    norm_type: str = "low_precision_layernorm",
    use_cache: bool = False,
    init_config: dict = init_config_defaults,
    fc_type: str = "torch",
    verbose: Optional[int] = None,
    **kwargs: Any,
)
Source code in vllm/transformers_utils/configs/mpt.py
def __init__(self,
             d_model: int = 2048,
             n_heads: int = 16,
             n_layers: int = 24,
             expansion_ratio: int = 4,
             max_seq_len: int = 2048,
             vocab_size: int = 50368,
             resid_pdrop: float = 0.0,
             emb_pdrop: float = 0.0,
             learned_pos_emb: bool = True,
             attn_config: dict = attn_config_defaults,
             ffn_config: dict = ffn_config_defaults,
             init_device: str = 'cpu',
             logit_scale: Optional[Union[float, str]] = None,
             no_bias: bool = False,
             embedding_fraction: float = 1.0,
             norm_type: str = 'low_precision_layernorm',
             use_cache: bool = False,
             init_config: dict = init_config_defaults,
             fc_type: str = 'torch',
             verbose: Optional[int] = None,
             **kwargs: Any):
    self.d_model = d_model
    self.n_heads = n_heads
    self.n_layers = n_layers
    self.expansion_ratio = expansion_ratio
    self.max_seq_len = max_seq_len
    self.vocab_size = vocab_size
    self.resid_pdrop = resid_pdrop
    self.emb_pdrop = emb_pdrop
    self.learned_pos_emb = learned_pos_emb
    self.attn_config = attn_config
    self.ffn_config = ffn_config
    self.init_device = init_device
    self.logit_scale = logit_scale
    self.no_bias = no_bias
    self.embedding_fraction = embedding_fraction
    self.norm_type = norm_type
    self.use_cache = use_cache
    self.init_config = init_config
    self.fc_type = fc_type
    if verbose is not None:
        warnings.warn(DeprecationWarning(
            'verbose argument for MPTConfig is now ignored and '
            'will be removed. Use python_log_level instead.'),
                      stacklevel=2)
    if 'name' in kwargs:
        del kwargs['name']
    if 'loss_fn' in kwargs:
        del kwargs['loss_fn']
    if self.attn_config.get('alibi', False):
        self.learned_pos_emb = False
        warnings.warn(
            f'alibi is turned on, setting `learned_pos_emb` '
            f'to {self.learned_pos_emb}`',
            stacklevel=2)
    super().__init__(**kwargs)
    self._validate_config()

_set_config_defaults

_set_config_defaults(
    config: dict[str, Any], config_defaults: dict[str, Any]
) -> dict[str, Any]
Source code in vllm/transformers_utils/configs/mpt.py
def _set_config_defaults(
        self, config: dict[str, Any],
        config_defaults: dict[str, Any]) -> dict[str, Any]:
    for (k, v) in config_defaults.items():
        if k not in config:
            config[k] = v
    return config

_validate_config

_validate_config() -> None
Source code in vllm/transformers_utils/configs/mpt.py
def _validate_config(self) -> None:
    self.attn_config = self._set_config_defaults(self.attn_config,
                                                 attn_config_defaults)
    self.ffn_config = self._set_config_defaults(self.ffn_config,
                                                ffn_config_defaults)
    self.init_config = self._set_config_defaults(self.init_config,
                                                 init_config_defaults)
    if self.d_model % self.n_heads != 0:
        raise ValueError('d_model must be divisible by n_heads')
    if any(
            prob < 0 or prob > 1 for prob in
        [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop
         ]):
        raise ValueError(
            "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
            "probabilities and must be between 0 and 1")
    if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
        raise ValueError(
            f"Unknown attn_impl={self.attn_config['attn_impl']}")
    if self.attn_config['prefix_lm'] and self.attn_config[
            'attn_impl'] not in ['torch', 'triton']:
        raise NotImplementedError(
            'prefix_lm only implemented with torch and triton attention.')
    if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
            'torch', 'triton'
    ]:
        raise NotImplementedError(
            'alibi only implemented with torch and triton attention.')
    if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
            'attn_impl'] not in ['torch', 'triton']:
        raise NotImplementedError(
            'attn_uses_sequence_id only implemented with torch '
            'and triton attention.')
    if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
        raise ValueError(
            'model.embedding_fraction must be between 0 (exclusive) '
            'and 1 (inclusive)!')
    if isinstance(self.logit_scale,
                  str) and self.logit_scale != 'inv_sqrt_d_model':
        raise ValueError(
            f"self.logit_scale={self.logit_scale!r} is not recognized as "
            "an option; use numeric value or 'inv_sqrt_d_model'.")
    if self.init_config.get('name', None) is None:
        raise ValueError(
            f"self.init_config={self.init_config!r} 'name' needs to be set."
        )
    if not self.learned_pos_emb and (not self.attn_config['alibi']):
        warnings.warn(
            'Positional information not being provided to the model.',
            stacklevel=2)
    if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
        try:
            # pylint: disable=import-outside-toplevel
            import transformer_engine.pytorch as te
            del te
        except Exception as exc:
            raise ImportError(
                'TransformerEngine import fail. `fc_type: te` requires '
                'TransformerEngine be installed. '
                'The required version of transformer_engine also requires '
                'FlashAttention v1.0.6 is installed:\n'
                'pip install flash-attn==1.0.6 --no-build-isolation \n'
                'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
            ) from exc
    if self.ffn_config['ffn_type'] == 'mptmlp':
        self.ffn_config['fc_type'] = self.fc_type
    elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
        self.ffn_config['bias'] = not self.no_bias
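
Since `_validate_config` runs at the end of `__init__`, invalid combinations fail immediately. A rough sketch, assuming the module's default `attn_config`/`ffn_config`/`init_config` values:

>>> cfg = MPTConfig(d_model=2048, n_heads=16, logit_scale="inv_sqrt_d_model")   # accepted
>>> cfg = MPTConfig(d_model=100, n_heads=16)   # raises ValueError: d_model must be divisible by n_heads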

MedusaConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/medusa.py
class MedusaConfig(PretrainedConfig):
    model_type = "medusa"

    def __init__(self,
                 hidden_size: int = 4096,
                 vocab_size: int = 32001,
                 num_heads: int = 5,
                 num_hidden_layers: int = 1,
                 max_paths: int = 64,
                 topk: int = 10,
                 truncated_vocab_size: Optional[int] = None,
                 **kwargs):

        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_hidden_layers = num_hidden_layers
        self.max_paths = max_paths
        self.topk = topk
        self.max_seq_len = int(2**20)
        self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
            else truncated_vocab_size
        if "architectures" not in kwargs:
            kwargs["architectures"] = ["MedusaModel"]

        super().__init__(**kwargs)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "MedusaConfig":
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        for k in list(config_dict.keys()):
            if 'num' in k:
                if 'heads' in k:
                    config_dict["num_heads"] = config_dict.pop(k)
                elif 'layers' in k:
                    config_dict["num_hidden_layers"] = config_dict.pop(k)
        return cls.from_dict(config_dict, **kwargs)

    @property
    def num_attention_heads(self):
        return 0

    @property
    def num_lookahead_tokens(self):
        return self.num_heads

    @num_lookahead_tokens.setter
    def num_lookahead_tokens(self, num_lookahead_tokens: int):
        self.num_heads = num_lookahead_tokens

hidden_size instance-attribute

hidden_size = hidden_size

max_paths instance-attribute

max_paths = max_paths

max_seq_len instance-attribute

max_seq_len = int(2 ** 20)

model_type class-attribute instance-attribute

model_type = 'medusa'

num_attention_heads property

num_attention_heads

num_heads instance-attribute

num_heads = num_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_lookahead_tokens property writable

num_lookahead_tokens

topk instance-attribute

topk = topk

truncated_vocab_size instance-attribute

truncated_vocab_size = (
    vocab_size
    if truncated_vocab_size is None
    else truncated_vocab_size
)

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    hidden_size: int = 4096,
    vocab_size: int = 32001,
    num_heads: int = 5,
    num_hidden_layers: int = 1,
    max_paths: int = 64,
    topk: int = 10,
    truncated_vocab_size: Optional[int] = None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/medusa.py
def __init__(self,
             hidden_size: int = 4096,
             vocab_size: int = 32001,
             num_heads: int = 5,
             num_hidden_layers: int = 1,
             max_paths: int = 64,
             topk: int = 10,
             truncated_vocab_size: Optional[int] = None,
             **kwargs):

    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.num_heads = num_heads
    self.num_hidden_layers = num_hidden_layers
    self.max_paths = max_paths
    self.topk = topk
    self.max_seq_len = int(2**20)
    self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
        else truncated_vocab_size
    if "architectures" not in kwargs:
        kwargs["architectures"] = ["MedusaModel"]

    super().__init__(**kwargs)

from_pretrained classmethod

from_pretrained(
    pretrained_model_name_or_path: Union[str, PathLike],
    **kwargs,
) -> MedusaConfig
Source code in vllm/transformers_utils/configs/medusa.py
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: Union[str, os.PathLike],
    **kwargs,
) -> "MedusaConfig":
    config_dict, kwargs = cls.get_config_dict(
        pretrained_model_name_or_path, **kwargs)
    for k in list(config_dict.keys()):
        if 'num' in k:
            if 'heads' in k:
                config_dict["num_heads"] = config_dict.pop(k)
            elif 'layers' in k:
                config_dict["num_hidden_layers"] = config_dict.pop(k)
    return cls.from_dict(config_dict, **kwargs)
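
As an illustrative usage sketch (the values are arbitrary): `num_lookahead_tokens` simply aliases `num_heads`, and `num_attention_heads` is fixed at 0, as shown in the source above.

```python
from vllm.transformers_utils.configs import MedusaConfig

cfg = MedusaConfig(num_heads=4, num_hidden_layers=1)
assert cfg.num_lookahead_tokens == 4   # property reads num_heads
cfg.num_lookahead_tokens = 5           # setter writes back to num_heads
assert cfg.num_heads == 5
assert cfg.num_attention_heads == 0    # fixed property, see above
```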

MiniMaxText01Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/minimax_text_01.py
class MiniMaxText01Config(PretrainedConfig):
    model_type = "MiniMaxText01"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=4096 * 32,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=None,
        eos_token_id=None,
        tie_word_embeddings=False,
        rope_theta=1e6,
        sliding_window=None,
        attention_dropout=0.0,
        num_experts_per_tok=2,
        num_local_experts=8,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        router_jitter_noise=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.router_jitter_noise = router_jitter_noise
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

attention_dropout instance-attribute

attention_dropout = attention_dropout

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

model_type class-attribute instance-attribute

model_type = 'MiniMaxText01'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_experts_per_tok instance-attribute

num_experts_per_tok = num_experts_per_tok

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

num_local_experts instance-attribute

num_local_experts = num_local_experts

output_router_logits instance-attribute

output_router_logits = output_router_logits

rms_norm_eps instance-attribute

rms_norm_eps = rms_norm_eps

rope_theta instance-attribute

rope_theta = rope_theta

router_aux_loss_coef instance-attribute

router_aux_loss_coef = router_aux_loss_coef

router_jitter_noise instance-attribute

router_jitter_noise = router_jitter_noise

sliding_window instance-attribute

sliding_window = sliding_window

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=32000,
    hidden_size=4096,
    intermediate_size=14336,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=8,
    hidden_act="silu",
    max_position_embeddings=4096 * 32,
    initializer_range=0.02,
    rms_norm_eps=1e-05,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=None,
    eos_token_id=None,
    tie_word_embeddings=False,
    rope_theta=1000000.0,
    sliding_window=None,
    attention_dropout=0.0,
    num_experts_per_tok=2,
    num_local_experts=8,
    output_router_logits=False,
    router_aux_loss_coef=0.001,
    router_jitter_noise=0.0,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/minimax_text_01.py
def __init__(
    self,
    vocab_size=32000,
    hidden_size=4096,
    intermediate_size=14336,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=8,
    hidden_act="silu",
    max_position_embeddings=4096 * 32,
    initializer_range=0.02,
    rms_norm_eps=1e-5,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=None,
    eos_token_id=None,
    tie_word_embeddings=False,
    rope_theta=1e6,
    sliding_window=None,
    attention_dropout=0.0,
    num_experts_per_tok=2,
    num_local_experts=8,
    output_router_logits=False,
    router_aux_loss_coef=0.001,
    router_jitter_noise=0.0,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.sliding_window = sliding_window

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.rms_norm_eps = rms_norm_eps
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.attention_dropout = attention_dropout

    self.num_experts_per_tok = num_experts_per_tok
    self.num_local_experts = num_local_experts
    self.output_router_logits = output_router_logits
    self.router_aux_loss_coef = router_aux_loss_coef
    self.router_jitter_noise = router_jitter_noise
    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
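
A brief, illustrative construction (the overridden values are arbitrary); note that `num_key_value_heads` falls back to `num_attention_heads` only when it is explicitly `None`.

```python
from vllm.transformers_utils.configs import MiniMaxText01Config

cfg = MiniMaxText01Config(num_local_experts=16, num_experts_per_tok=2,
                          num_key_value_heads=None)
print(cfg.max_position_embeddings)  # 4096 * 32 = 131072 (default)
print(cfg.num_key_value_heads)      # 32, inherited from num_attention_heads
```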

MiniMaxVL01Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/minimax_vl_01.py
class MiniMaxVL01Config(PretrainedConfig):
    model_type = "minimax_vl_01"

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        image_token_index=32000,
        projector_hidden_act="gelu",
        vision_feature_select_strategy="default",
        vision_feature_layer=-2,
        image_grid_pinpoints=None,
        tie_word_embeddings=False,
        image_seq_length=576,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.image_token_index = image_token_index
        self.projector_hidden_act = projector_hidden_act
        self.image_seq_length = image_seq_length

        if vision_feature_select_strategy not in ["default", "full"]:
            raise ValueError("vision_feature_select_strategy should " +
                             "be one of 'default', 'full'." +
                             f"Got: {vision_feature_select_strategy}")

        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.vision_feature_layer = vision_feature_layer
        image_grid_pinpoints = (
            image_grid_pinpoints if image_grid_pinpoints is not None else
            [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]])
        self.image_grid_pinpoints = image_grid_pinpoints

        if isinstance(vision_config, dict):
            if "model_type" not in vision_config:
                vision_config["model_type"] = "clip_vision_model"
            vision_config = CONFIG_MAPPING[vision_config["model_type"]](
                **vision_config)
        elif vision_config is None:
            vision_config = CONFIG_MAPPING["clip_vision_model"](
                intermediate_size=4096,
                hidden_size=1024,
                patch_size=14,
                image_size=336,
                num_hidden_layers=24,
                num_attention_heads=16,
                vocab_size=32000,
                projection_dim=768,
            )

        self.vision_config = vision_config

        if text_config is not None:
            text_config = MiniMaxText01Config(**text_config)
        else:
            text_config = MiniMaxText01Config()

        self.text_config = text_config

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

ignore_index instance-attribute

ignore_index = ignore_index

image_grid_pinpoints instance-attribute

image_grid_pinpoints = image_grid_pinpoints

image_seq_length instance-attribute

image_seq_length = image_seq_length

image_token_index instance-attribute

image_token_index = image_token_index

model_type class-attribute instance-attribute

model_type = 'minimax_vl_01'

projector_hidden_act instance-attribute

projector_hidden_act = projector_hidden_act

text_config instance-attribute

text_config = text_config

vision_config instance-attribute

vision_config = vision_config

vision_feature_layer instance-attribute

vision_feature_layer = vision_feature_layer

vision_feature_select_strategy instance-attribute

vision_feature_select_strategy = (
    vision_feature_select_strategy
)

__init__

__init__(
    vision_config=None,
    text_config=None,
    ignore_index=-100,
    image_token_index=32000,
    projector_hidden_act="gelu",
    vision_feature_select_strategy="default",
    vision_feature_layer=-2,
    image_grid_pinpoints=None,
    tie_word_embeddings=False,
    image_seq_length=576,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/minimax_vl_01.py
def __init__(
    self,
    vision_config=None,
    text_config=None,
    ignore_index=-100,
    image_token_index=32000,
    projector_hidden_act="gelu",
    vision_feature_select_strategy="default",
    vision_feature_layer=-2,
    image_grid_pinpoints=None,
    tie_word_embeddings=False,
    image_seq_length=576,
    **kwargs,
):
    self.ignore_index = ignore_index
    self.image_token_index = image_token_index
    self.projector_hidden_act = projector_hidden_act
    self.image_seq_length = image_seq_length

    if vision_feature_select_strategy not in ["default", "full"]:
        raise ValueError("vision_feature_select_strategy should " +
                         "be one of 'default', 'full'." +
                         f"Got: {vision_feature_select_strategy}")

    self.vision_feature_select_strategy = vision_feature_select_strategy
    self.vision_feature_layer = vision_feature_layer
    image_grid_pinpoints = (
        image_grid_pinpoints if image_grid_pinpoints is not None else
        [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]])
    self.image_grid_pinpoints = image_grid_pinpoints

    if isinstance(vision_config, dict):
        if "model_type" not in vision_config:
            vision_config["model_type"] = "clip_vision_model"
        vision_config = CONFIG_MAPPING[vision_config["model_type"]](
            **vision_config)
    elif vision_config is None:
        vision_config = CONFIG_MAPPING["clip_vision_model"](
            intermediate_size=4096,
            hidden_size=1024,
            patch_size=14,
            image_size=336,
            num_hidden_layers=24,
            num_attention_heads=16,
            vocab_size=32000,
            projection_dim=768,
        )

    self.vision_config = vision_config

    if text_config is not None:
        text_config = MiniMaxText01Config(**text_config)
    else:
        text_config = MiniMaxText01Config()

    self.text_config = text_config

    super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
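
For illustration, constructing the config with no arguments shows how the vision and text sub-configs are filled in; the printed values follow directly from the defaults above.

```python
from vllm.transformers_utils.configs import MiniMaxVL01Config

cfg = MiniMaxVL01Config()
print(type(cfg.vision_config).__name__)  # CLIPVisionConfig via CONFIG_MAPPING
print(cfg.text_config.model_type)        # 'MiniMaxText01'
print(cfg.image_grid_pinpoints[0])       # [336, 672]
```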

MllamaConfig

Bases: MllamaConfig

Source code in vllm/transformers_utils/configs/mllama.py
class MllamaConfig(mllama_hf_config.MllamaConfig):

    def __init__(
        self,
        text_config=None,
        **kwargs,
    ):
        if isinstance(text_config, dict):
            text_config = MllamaTextConfig(**text_config)
        super().__init__(text_config=text_config, **kwargs)

__init__

__init__(text_config=None, **kwargs)
Source code in vllm/transformers_utils/configs/mllama.py
def __init__(
    self,
    text_config=None,
    **kwargs,
):
    if isinstance(text_config, dict):
        text_config = MllamaTextConfig(**text_config)
    super().__init__(text_config=text_config, **kwargs)
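
A small, hedged sketch of the behavior above: a plain `dict` passed as `text_config` is promoted to an `MllamaTextConfig` before being handed to the upstream initializer (the `vocab_size` override is an arbitrary example value).

```python
from vllm.transformers_utils.configs import MllamaConfig

cfg = MllamaConfig(text_config={"vocab_size": 128256})
print(type(cfg.text_config).__name__)  # MllamaTextConfig
print(cfg.text_config.vocab_size)      # 128256
```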

MoonViTConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/moonvit.py
class MoonViTConfig(PretrainedConfig):
    model_type = "moonvit"

    def __init__(
            self,
            patch_size: int = 14,
            init_pos_emb_height: int = 64,
            init_pos_emb_width: int = 64,
            num_attention_heads: int = 16,
            num_hidden_layers: int = 27,
            hidden_size: int = 1152,
            intermediate_size: int = 4304,
            merge_kernel_size: tuple[int, int] = (2, 2),
            **kwargs,
    ):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        # Positional embedding config
        self.init_pos_emb_height = init_pos_emb_height
        self.init_pos_emb_width = init_pos_emb_width
        # Transformer config
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        # Patch merger config
        self.merge_kernel_size = merge_kernel_size

hidden_size instance-attribute

hidden_size = hidden_size

init_pos_emb_height instance-attribute

init_pos_emb_height = init_pos_emb_height

init_pos_emb_width instance-attribute

init_pos_emb_width = init_pos_emb_width

intermediate_size instance-attribute

intermediate_size = intermediate_size

merge_kernel_size instance-attribute

merge_kernel_size = merge_kernel_size

model_type class-attribute instance-attribute

model_type = 'moonvit'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

patch_size instance-attribute

patch_size = patch_size

__init__

__init__(
    patch_size: int = 14,
    init_pos_emb_height: int = 64,
    init_pos_emb_width: int = 64,
    num_attention_heads: int = 16,
    num_hidden_layers: int = 27,
    hidden_size: int = 1152,
    intermediate_size: int = 4304,
    merge_kernel_size: tuple[int, int] = (2, 2),
    **kwargs,
)
Source code in vllm/transformers_utils/configs/moonvit.py
def __init__(
        self,
        patch_size: int = 14,
        init_pos_emb_height: int = 64,
        init_pos_emb_width: int = 64,
        num_attention_heads: int = 16,
        num_hidden_layers: int = 27,
        hidden_size: int = 1152,
        intermediate_size: int = 4304,
        merge_kernel_size: tuple[int, int] = (2, 2),
        **kwargs,
):
    super().__init__(**kwargs)
    self.patch_size = patch_size
    # Positional embedding config
    self.init_pos_emb_height = init_pos_emb_height
    self.init_pos_emb_width = init_pos_emb_width
    # Transformer config
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    # Patch merger config
    self.merge_kernel_size = merge_kernel_size
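
An illustrative construction using only the defaults; the per-head dimension is not stored on the config but follows from `hidden_size / num_attention_heads` (1152 / 16 = 72).

```python
from vllm.transformers_utils.configs import MoonViTConfig

cfg = MoonViTConfig()
print(cfg.hidden_size // cfg.num_attention_heads)  # 72
print(cfg.merge_kernel_size)                       # (2, 2)
```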

NVLM_D_Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/nvlm_d.py
class NVLM_D_Config(PretrainedConfig):
    model_type = 'NVLM_D'
    is_composition = True

    def __init__(self, vision_config=None, llm_config=None, **kwargs):
        super().__init__(**kwargs)

        # Handle vision_config initialization
        if vision_config is None:
            vision_config = {}

        # Handle llm_config initialization
        if llm_config is None:
            llm_config = {}

        self.vision_config = PretrainedConfig(**vision_config)
        self.text_config = Qwen2Config(**llm_config)

is_composition class-attribute instance-attribute

is_composition = True

model_type class-attribute instance-attribute

model_type = 'NVLM_D'

text_config instance-attribute

text_config = Qwen2Config(**llm_config)

vision_config instance-attribute

vision_config = PretrainedConfig(**vision_config)

__init__

__init__(vision_config=None, llm_config=None, **kwargs)
Source code in vllm/transformers_utils/configs/nvlm_d.py
def __init__(self, vision_config=None, llm_config=None, **kwargs):
    super().__init__(**kwargs)

    # Handle vision_config initialization
    if vision_config is None:
        vision_config = {}

    # Handle llm_config initialization
    if llm_config is None:
        llm_config = {}

    self.vision_config = PretrainedConfig(**vision_config)
    self.text_config = Qwen2Config(**llm_config)
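
A short illustrative construction: with no arguments, the vision sub-config is a bare `PretrainedConfig` and the language sub-config is a default `Qwen2Config`.

```python
from vllm.transformers_utils.configs import NVLM_D_Config

cfg = NVLM_D_Config()
print(type(cfg.vision_config).__name__)  # PretrainedConfig
print(cfg.text_config.model_type)        # 'qwen2'
```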

NemotronConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [NemotronModel]. It is used to instantiate a Nemotron model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Nemotron-8B.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

Name Type Description Default
vocab_size `int`, *optional*, defaults to 256000

Vocabulary size of the Nemotron model. Defines the number of different tokens that can be represented by the inputs_ids passed when calling [NemotronModel]

256000
hidden_size `int`, *optional*, defaults to 6144

Dimension of the hidden representations.

6144
intermediate_size `int`, *optional*, defaults to 24576

Dimension of the MLP representations.

24576
num_hidden_layers `int`, *optional*, defaults to 32

Number of hidden layers in the Transformer decoder.

32
num_attention_heads `int`, *optional*, defaults to 48

Number of attention heads for each attention layer in the Transformer decoder.

48
head_dim `int`, *optional*

Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if None

None
num_key_value_heads `int`, *optional*

This is the number of key_value heads that should be used to implement Grouped Query Attention. If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by mean-pooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to `num_attention_heads`.

None
hidden_act `str` or `function`, *optional*, defaults to `"relu2"`

The non-linear activation function (function or string) in the decoder.

'relu2'
max_position_embeddings `int`, *optional*, defaults to 4096

The maximum sequence length that this model might ever be used with.

4096
initializer_range `float`, *optional*, defaults to 0.0134

The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

0.0134
norm_eps `float`, *optional*, defaults to 1e-05

The epsilon used by the normalization layers.

1e-05
use_cache `bool`, *optional*, defaults to `True`

Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if config.is_decoder=True.

True
pad_token_id `int`, *optional*

Padding token id.

None
bos_token_id `int`, *optional*, defaults to 2

Beginning of stream token id.

2
eos_token_id `int`, *optional*, defaults to 3

End of stream token id.

3
tie_word_embeddings `bool`, *optional*, defaults to `False`

Whether to tie weight embeddings

False
rope_theta `float`, *optional*, defaults to 10000.0

The base period of the RoPE embeddings.

10000.0
partial_rotary_factor `float`, *optional*, defaults to 0.5

Percentage of the query and keys which will have rotary embedding.

0.5
attention_bias `bool`, *optional*, defaults to `False`

Whether to use a bias in the query, key, value and output projection layers during self-attention.

False
attention_dropout `float`, *optional*, defaults to 0.0

The dropout ratio for the attention probabilities.

0.0
mlp_bias `bool`, *optional*, defaults to `False`

Whether to use a bias in up_proj and down_proj layers in the MLP layers.

False
>>> from transformers import NemotronModel, NemotronConfig
>>> # Initializing a Nemotron nemotron-15b style configuration
>>> configuration = NemotronConfig()
>>> # Initializing a model from the nemotron-15b style configuration
>>> model = NemotronModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
Source code in vllm/transformers_utils/configs/nemotron.py
class NemotronConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`NemotronModel`]. It is used to instantiate an Nemotron model
    according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar
    configuration to that of the Nemotron-8B.

    Configuration objects inherit from [`PretrainedConfig`] and can be
    used to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Nemotron model. Defines the number of
            different tokens that can be represented by the
            `inputs_ids` passed when calling [`NemotronModel`]
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        head_dim (`int`, *optional*):
            Projection weights dimension in multi-head attention. Set to
            hidden_size // num_attention_heads if None
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention
            (MQA) otherwise GQA is used. When converting a multi-head
            checkpoint to a GQA checkpoint, each group key and value
            head should be constructed by meanpooling all the original
            heads within that group. For more details checkout 
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
            is not specified, will default to `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
            The non-linear activation function (function or string) in the
            decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        initializer_range (`float`, *optional*, defaults to 0.0134):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 3):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
            Percentage of the query and keys which will have rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output
            projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj and down_proj layers in the MLP
            layers.

    ```python
    >>> from transformers import NemotronModel, NemotronConfig
    >>> # Initializing a Nemotron nemotron-15b style configuration
    >>> configuration = NemotronConfig()
    >>> # Initializing a model from the nemotron-15b style configuration
    >>> model = NemotronModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "nemotron"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=6144,
        intermediate_size=24576,
        num_hidden_layers=32,
        num_attention_heads=48,
        head_dim=None,
        num_key_value_heads=None,
        hidden_act="relu2",
        max_position_embeddings=4096,
        initializer_range=0.0134,
        norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=2,
        eos_token_id=3,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.5,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        head_dim = head_dim or kwargs.get("kv_channels")
        self.head_dim = head_dim if head_dim is not None else (
            hidden_size // num_attention_heads)

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.norm_eps = norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # for backward compatibility
        partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
            "rope_percentage") or partial_rotary_factor
        self.partial_rotary_factor = partial_rotary_factor
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(
                self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, "
                f"`type` and `factor`, got {self.rope_scaling}")
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in [
                "linear", "dynamic"
        ]:
            raise ValueError(
                "`rope_scaling`'s type field must be one of ['linear', "
                f"'dynamic'], got {rope_scaling_type}")
        if rope_scaling_factor is None or not isinstance(
                rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(
                "`rope_scaling`'s factor field must be a float > 1, got "
                f"{rope_scaling_factor}")

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

head_dim instance-attribute

head_dim = (
    head_dim
    if head_dim is not None
    else hidden_size // num_attention_heads
)

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

mlp_bias instance-attribute

mlp_bias = mlp_bias

model_type class-attribute instance-attribute

model_type = 'nemotron'

norm_eps instance-attribute

norm_eps = norm_eps

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

partial_rotary_factor instance-attribute

partial_rotary_factor = partial_rotary_factor

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=256000,
    hidden_size=6144,
    intermediate_size=24576,
    num_hidden_layers=32,
    num_attention_heads=48,
    head_dim=None,
    num_key_value_heads=None,
    hidden_act="relu2",
    max_position_embeddings=4096,
    initializer_range=0.0134,
    norm_eps=1e-05,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=2,
    eos_token_id=3,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    partial_rotary_factor=0.5,
    attention_bias=False,
    attention_dropout=0.0,
    mlp_bias=False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/nemotron.py
def __init__(
    self,
    vocab_size=256000,
    hidden_size=6144,
    intermediate_size=24576,
    num_hidden_layers=32,
    num_attention_heads=48,
    head_dim=None,
    num_key_value_heads=None,
    hidden_act="relu2",
    max_position_embeddings=4096,
    initializer_range=0.0134,
    norm_eps=1e-5,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=2,
    eos_token_id=3,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    partial_rotary_factor=0.5,
    attention_bias=False,
    attention_dropout=0.0,
    mlp_bias=False,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    head_dim = head_dim or kwargs.get("kv_channels")
    self.head_dim = head_dim if head_dim is not None else (
        hidden_size // num_attention_heads)

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.norm_eps = norm_eps
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    # for backward compatibility
    partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
        "rope_percentage") or partial_rotary_factor
    self.partial_rotary_factor = partial_rotary_factor
    self._rope_scaling_validation()
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout
    self.mlp_bias = mlp_bias

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )

_rope_scaling_validation

_rope_scaling_validation()

Validate the rope_scaling configuration.

Source code in vllm/transformers_utils/configs/nemotron.py
def _rope_scaling_validation(self):
    """
    Validate the `rope_scaling` configuration.
    """
    if self.rope_scaling is None:
        return

    if not isinstance(self.rope_scaling, dict) or len(
            self.rope_scaling) != 2:
        raise ValueError(
            "`rope_scaling` must be a dictionary with two fields, "
            f"`type` and `factor`, got {self.rope_scaling}")
    rope_scaling_type = self.rope_scaling.get("type", None)
    rope_scaling_factor = self.rope_scaling.get("factor", None)
    if rope_scaling_type is None or rope_scaling_type not in [
            "linear", "dynamic"
    ]:
        raise ValueError(
            "`rope_scaling`'s type field must be one of ['linear', "
            f"'dynamic'], got {rope_scaling_type}")
    if rope_scaling_factor is None or not isinstance(
            rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
        raise ValueError(
            "`rope_scaling`'s factor field must be a float > 1, got "
            f"{rope_scaling_factor}")

NemotronHConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [NemotronHModel]. It is used to instantiate a NemotronH model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the NemotronH-v0.1 model.

Parameters:

vocab_size (int, optional, defaults to 131072): Vocabulary size of the NemotronH model. Defines the number of different tokens that can be represented by the inputs_ids passed when calling [NemotronHModel].
tie_word_embeddings (bool, optional, defaults to False): Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the model has an output word embedding layer.
hidden_size (int, optional, defaults to 4096): Dimension of the hidden representations.
intermediate_size (int, optional, defaults to 21504): Dimension of the MLP representations.
num_hidden_layers (int, optional, defaults to 52): Number of hidden layers in the Transformer encoder.
hybrid_override_pattern (str, optional, defaults to "M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"): The pattern of the hybrid model. The pattern is a string of characters where each character represents M: Mamba2, *: Attention, -: MLP.
num_attention_heads (int, optional, defaults to 32): Number of attention heads for each attention layer in the Transformer encoder.
attention_head_dim (int, optional, defaults to 128): Dimension of each attention head.
num_key_value_heads (int, optional, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. If num_key_value_heads=num_attention_heads, the model will use Multi Head Attention (MHA); if num_key_value_heads=1, the model will use Multi Query Attention (MQA); otherwise GQA is used.
mlp_hidden_act (str, optional, defaults to "relu2"): The non-linear activation function in the MLP layers.
attention_bias (bool, optional, defaults to False): Whether to use bias in attention layers.
mlp_bias (bool, optional, defaults to False): Whether to use bias in MLP layers.
use_bias (bool, optional, defaults to False): Whether to use bias in the model.
initializer_range (float, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_epsilon (float, optional, defaults to 1e-5): The epsilon used by the layer normalization layers.
residual_in_fp32 (bool, optional, defaults to False): Whether or not residuals should be in float32. If set to False, residuals will keep the same dtype as the rest of the model.
use_cache (bool, optional, defaults to True): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if config.is_decoder=True.
num_logits_to_keep (int or None, optional, defaults to 1): Number of prompt logits to calculate during generation. If None, all logits will be calculated. If an integer value, only the last num_logits_to_keep logits will be calculated.
pad_token_id (int, optional, defaults to 0): The id of the padding token.
bos_token_id (int, optional, defaults to 1): The id of the "beginning-of-sequence" token.
eos_token_id (int, optional, defaults to 2): The id of the "end-of-sequence" token.
sliding_window (int, optional, defaults to None): Sliding window attention window size.
max_position_embeddings (int, optional, defaults to 4096): The maximum sequence length that this model might ever be used with.
attention_dropout (float, optional, defaults to 0.0): The dropout ratio for the attention probabilities.
hidden_dropout (float, optional, defaults to 0.0): The dropout ratio for the hidden states.
use_mamba_kernels (bool, optional, defaults to True): Flag indicating whether or not to use the fast mamba kernels. These are available only if mamba-ssm and causal-conv1d are installed, and the mamba modules are running on a CUDA device.
ssm_state_size (int, optional, defaults to 128): The dimension of the mamba state space latents.
mamba_num_heads (int, optional, defaults to 128): Number of heads in Mamba layers.
mamba_n_groups (int, optional, defaults to 8): Number of groups in Mamba layers.
mamba_head_dim (int, optional, defaults to 64): Dimension of each Mamba head.
mamba_d_conv (int, optional, defaults to 4): The size of the mamba convolution kernel.
mamba_expand (int, optional, defaults to 2): Expanding factor used to determine the mamba intermediate size.
mamba_hidden_act (str, optional, defaults to "silu"): The non-linear activation function in the Mamba layers.
mamba_dt_min (float, optional, defaults to 0.001): Minimum value for the time step in Mamba.
mamba_dt_max (float, optional, defaults to 0.1): Maximum value for the time step in Mamba.
mamba_dt_limit (tuple, optional, defaults to (0.0, float("inf"))): Limits for the time step in Mamba.
mamba_dt_init_floor (float, optional, defaults to 1e-4): Floor value for time step initialization in Mamba.
mamba_conv_bias (bool, optional, defaults to True): Whether to use bias in the convolution layer of the mamba mixer block.
mamba_proj_bias (bool, optional, defaults to False): Whether to use bias in the input and output projections of the mamba mixer block.
mamba_chunk_size (int, optional, defaults to 256): Size of chunks for Mamba processing.
rescale_prenorm_residual (bool, optional, defaults to True): Whether to rescale the pre-normalization residual connections.

Source code in vllm/transformers_utils/configs/nemotron_h.py
class NemotronHConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`NemotronHModel`]. It is used to instantiate a NemotronH model according
    to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to
    that of the NemotronH-v0.1 model.
    Args:
        vocab_size (`int`, *optional*, defaults to 131072):
            Vocabulary size of the NemotronH model. Defines the number of
            different tokens that can be represented by the `inputs_ids`
            passed when calling [`NemotronHModel`]
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be
            tied. Note that this is only relevant if the model has a output
            word embedding layer.
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 21504):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 52):
            Number of hidden layers in the Transformer encoder.
        hybrid_override_pattern (`str`, *optional*, defaults to
            `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
            The pattern of the hybrid model. The pattern is a string of
            characters where each character represents
            M: Mamba2, *: Attention, -: MLP
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        attention_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each attention head.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
            will use Multi Query Attention (MQA) otherwise GQA is used.
        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
            The non-linear activation function in the MLP layers.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in attention layers.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in MLP layers.
        use_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the model.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
            Whether or not residuals should be in `float32`. If set to `False`
            residuals will keep the same `dtype` as the rest of the model.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
            Number of prompt logits to calculate during generation. If `None`,
            all logits will be calculated. If an integer value, only last
            `num_logits_to_keep` logits will be calculated.
        pad_token_id (`int`, *optional*, defaults to 0):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        sliding_window (`int`, *optional*, defaults to None):
            Sliding window attention window size.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the hidden states.
        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
            Flag indicating whether or not to use the fast mamba kernels.
            These are available only if `mamba-ssm` and `causal-conv1d`
            are installed, and the mamba modules are running on a CUDA device.
        ssm_state_size (`int`, *optional*, defaults to 128):
            The dimension of the mamba state space latents.
        mamba_num_heads (`int`, *optional*, defaults to 128):
            Number of heads in Mamba layers.
        mamba_n_groups (`int`, *optional*, defaults to 8):
            Number of groups in Mamba layers.
        mamba_head_dim (`int`, *optional*, defaults to 64):
            Dimension of each Mamba head.
        mamba_d_conv (`int`, *optional*, defaults to 4):
            The size of the mamba convolution kernel.
        mamba_expand (`int`, *optional*, defaults to 2):
            Expanding factor used to determine the mamba intermediate size.
        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
            The non-linear activation function in the Mamba layers.
        mamba_dt_min (`float`, *optional*, defaults to 0.001):
            Minimum value for the time step in Mamba.
        mamba_dt_max (`float`, *optional*, defaults to 0.1):
            Maximum value for the time step in Mamba.
        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
            Limits for the time step in Mamba.
        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
            Floor value for time step initialization in Mamba.
        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the convolution layer of the mamba mixer
            block.
        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the input and output projections of the
            mamba mixer block.
        mamba_chunk_size (`int`, *optional*, defaults to 256):
            Size of chunks for Mamba processing.
        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
            Whether to rescale the pre-normalization residual connections.
    """

    model_type = "nemotron_h"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=131072,
        tie_word_embeddings=False,
        hidden_size=4096,
        intermediate_size=21504,
        num_hidden_layers=52,
        hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
        num_attention_heads=32,
        attention_head_dim=128,
        num_key_value_heads=8,  # nemo: num_query_groups
        mlp_hidden_act="relu2",
        attention_bias=False,
        mlp_bias=False,
        use_bias=False,
        initializer_range=0.02,  # nemo: init_method_std
        layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
        residual_in_fp32=False,  #  Megatron Core default value
        use_cache=True,
        num_logits_to_keep=1,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        sliding_window=None,
        max_position_embeddings=4096,
        attention_dropout=0.0,
        hidden_dropout=0.0,  # * ADDED
        use_mamba_kernels=True,
        ssm_state_size=128,  # mamba_state_size
        mamba_num_heads=128,
        mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
        mamba_head_dim=64,
        mamba_d_conv=4,
        mamba_expand=2,
        mamba_hidden_act="silu",
        mamba_dt_min=0.001,
        mamba_dt_max=0.1,
        mamba_dt_limit=(0.0, float("inf")),
        mamba_dt_init_floor=1e-4,
        mamba_conv_bias=True,
        mamba_proj_bias=False,
        mamba_chunk_size=256,
        rescale_prenorm_residual=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.tie_word_embeddings = tie_word_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.hybrid_override_pattern = hybrid_override_pattern
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        self.sliding_window = sliding_window
        self.max_position_embeddings = max_position_embeddings
        self.attention_dropout = attention_dropout
        self.hidden_dropout = hidden_dropout

        # Validate hybrid_override_pattern
        # M: Mamba2, *: Attention, -: MLP
        assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
            "hybrid_override_pattern must have same length as "
            "num_hidden_layers")
        assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
            "hybrid_override_pattern must only contain characters "
            "'M', '*', or '-'")

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.mlp_hidden_act = mlp_hidden_act
        self.attention_bias = attention_bias
        self.mlp_bias = mlp_bias
        self.use_bias = use_bias
        self.initializer_range = initializer_range
        self.layer_norm_epsilon = layer_norm_epsilon
        self.residual_in_fp32 = residual_in_fp32

        self.use_cache = use_cache
        self.num_logits_to_keep = num_logits_to_keep

        self.use_mamba_kernels = use_mamba_kernels
        self.n_groups = mamba_n_groups
        self.mamba_head_dim = mamba_head_dim
        self.ssm_state_size = ssm_state_size
        self.mamba_num_heads = mamba_num_heads
        self.conv_kernel = mamba_d_conv
        self.expand = mamba_expand
        self.mamba_hidden_act = mamba_hidden_act
        self.time_step_min = mamba_dt_min
        self.time_step_max = mamba_dt_max
        self.time_step_limit = mamba_dt_limit
        self.time_step_floor = mamba_dt_init_floor
        self.use_conv_bias = mamba_conv_bias
        self.mamba_proj_bias = mamba_proj_bias
        self.chunk_size = mamba_chunk_size
        self.rescale_prenorm_residual = rescale_prenorm_residual

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def layers_block_type(self):
        return [
            "mamba" if self.hybrid_override_pattern[i] == "M" else
            "attention" if self.hybrid_override_pattern[i] == "*" else "mlp"
            for i in range(self.num_hidden_layers)
        ]
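
As an illustrative (hypothetical) example of the pattern validation and the `layers_block_type` property, a four-layer hybrid with one attention and one MLP layer:

```python
from vllm.transformers_utils.configs import NemotronHConfig

# "M*-M": Mamba2, Attention, MLP, Mamba2 -- length must equal num_hidden_layers
cfg = NemotronHConfig(num_hidden_layers=4, hybrid_override_pattern="M*-M")
print(cfg.layers_block_type)  # ['mamba', 'attention', 'mlp', 'mamba']
```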

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

attention_head_dim instance-attribute

attention_head_dim = attention_head_dim

chunk_size instance-attribute

chunk_size = mamba_chunk_size

conv_kernel instance-attribute

conv_kernel = mamba_d_conv

expand instance-attribute

expand = mamba_expand

hidden_dropout instance-attribute

hidden_dropout = hidden_dropout

hidden_size instance-attribute

hidden_size = hidden_size

hybrid_override_pattern instance-attribute

hybrid_override_pattern = hybrid_override_pattern

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_epsilon instance-attribute

layer_norm_epsilon = layer_norm_epsilon

layers_block_type property

layers_block_type

mamba_head_dim instance-attribute

mamba_head_dim = mamba_head_dim

mamba_hidden_act instance-attribute

mamba_hidden_act = mamba_hidden_act

mamba_num_heads instance-attribute

mamba_num_heads = mamba_num_heads

mamba_proj_bias instance-attribute

mamba_proj_bias = mamba_proj_bias

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

mlp_bias instance-attribute

mlp_bias = mlp_bias

mlp_hidden_act instance-attribute

mlp_hidden_act = mlp_hidden_act

model_type class-attribute instance-attribute

model_type = 'nemotron_h'

n_groups instance-attribute

n_groups = mamba_n_groups

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

num_logits_to_keep instance-attribute

num_logits_to_keep = num_logits_to_keep

rescale_prenorm_residual instance-attribute

rescale_prenorm_residual = rescale_prenorm_residual

residual_in_fp32 instance-attribute

residual_in_fp32 = residual_in_fp32

sliding_window instance-attribute

sliding_window = sliding_window

ssm_state_size instance-attribute

ssm_state_size = ssm_state_size

tie_word_embeddings instance-attribute

tie_word_embeddings = tie_word_embeddings

time_step_floor instance-attribute

time_step_floor = mamba_dt_init_floor

time_step_limit instance-attribute

time_step_limit = mamba_dt_limit

time_step_max instance-attribute

time_step_max = mamba_dt_max

time_step_min instance-attribute

time_step_min = mamba_dt_min

use_bias instance-attribute

use_bias = use_bias

use_cache instance-attribute

use_cache = use_cache

use_conv_bias instance-attribute

use_conv_bias = mamba_conv_bias

use_mamba_kernels instance-attribute

use_mamba_kernels = use_mamba_kernels

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=131072,
    tie_word_embeddings=False,
    hidden_size=4096,
    intermediate_size=21504,
    num_hidden_layers=52,
    hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
    num_attention_heads=32,
    attention_head_dim=128,
    num_key_value_heads=8,
    mlp_hidden_act="relu2",
    attention_bias=False,
    mlp_bias=False,
    use_bias=False,
    initializer_range=0.02,
    layer_norm_epsilon=1e-05,
    residual_in_fp32=False,
    use_cache=True,
    num_logits_to_keep=1,
    pad_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    sliding_window=None,
    max_position_embeddings=4096,
    attention_dropout=0.0,
    hidden_dropout=0.0,
    use_mamba_kernels=True,
    ssm_state_size=128,
    mamba_num_heads=128,
    mamba_n_groups=8,
    mamba_head_dim=64,
    mamba_d_conv=4,
    mamba_expand=2,
    mamba_hidden_act="silu",
    mamba_dt_min=0.001,
    mamba_dt_max=0.1,
    mamba_dt_limit=(0.0, float("inf")),
    mamba_dt_init_floor=0.0001,
    mamba_conv_bias=True,
    mamba_proj_bias=False,
    mamba_chunk_size=256,
    rescale_prenorm_residual=True,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/nemotron_h.py
def __init__(
    self,
    vocab_size=131072,
    tie_word_embeddings=False,
    hidden_size=4096,
    intermediate_size=21504,
    num_hidden_layers=52,
    hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
    num_attention_heads=32,
    attention_head_dim=128,
    num_key_value_heads=8,  # nemo: num_query_groups
    mlp_hidden_act="relu2",
    attention_bias=False,
    mlp_bias=False,
    use_bias=False,
    initializer_range=0.02,  # nemo: init_method_std
    layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
    residual_in_fp32=False,  #  Megatron Core default value
    use_cache=True,
    num_logits_to_keep=1,
    pad_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    sliding_window=None,
    max_position_embeddings=4096,
    attention_dropout=0.0,
    hidden_dropout=0.0,  # * ADDED
    use_mamba_kernels=True,
    ssm_state_size=128,  # mamba_state_size
    mamba_num_heads=128,
    mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
    mamba_head_dim=64,
    mamba_d_conv=4,
    mamba_expand=2,
    mamba_hidden_act="silu",
    mamba_dt_min=0.001,
    mamba_dt_max=0.1,
    mamba_dt_limit=(0.0, float("inf")),
    mamba_dt_init_floor=1e-4,
    mamba_conv_bias=True,
    mamba_proj_bias=False,
    mamba_chunk_size=256,
    rescale_prenorm_residual=True,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.tie_word_embeddings = tie_word_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.hybrid_override_pattern = hybrid_override_pattern
    self.num_attention_heads = num_attention_heads
    self.attention_head_dim = attention_head_dim
    self.sliding_window = sliding_window
    self.max_position_embeddings = max_position_embeddings
    self.attention_dropout = attention_dropout
    self.hidden_dropout = hidden_dropout

    # Validate hybrid_override_pattern
    # M: Mamba2, *: Attention, -: MLP
    assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
        "hybrid_override_pattern must have same length as "
        "num_hidden_layers")
    assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
        "hybrid_override_pattern must only contain characters "
        "'M', '*', or '-'")

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.mlp_hidden_act = mlp_hidden_act
    self.attention_bias = attention_bias
    self.mlp_bias = mlp_bias
    self.use_bias = use_bias
    self.initializer_range = initializer_range
    self.layer_norm_epsilon = layer_norm_epsilon
    self.residual_in_fp32 = residual_in_fp32

    self.use_cache = use_cache
    self.num_logits_to_keep = num_logits_to_keep

    self.use_mamba_kernels = use_mamba_kernels
    self.n_groups = mamba_n_groups
    self.mamba_head_dim = mamba_head_dim
    self.ssm_state_size = ssm_state_size
    self.mamba_num_heads = mamba_num_heads
    self.conv_kernel = mamba_d_conv
    self.expand = mamba_expand
    self.mamba_hidden_act = mamba_hidden_act
    self.time_step_min = mamba_dt_min
    self.time_step_max = mamba_dt_max
    self.time_step_limit = mamba_dt_limit
    self.time_step_floor = mamba_dt_init_floor
    self.use_conv_bias = mamba_conv_bias
    self.mamba_proj_bias = mamba_proj_bias
    self.chunk_size = mamba_chunk_size
    self.rescale_prenorm_residual = rescale_prenorm_residual

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
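
The hybrid_override_pattern validated above drives the layers_block_type property: "M" marks a Mamba2 layer, "*" an attention layer, and "-" an MLP layer. A minimal, illustrative sketch (the 8-layer pattern below is made up for demonstration, not taken from a released checkpoint):

```python
# Hedged sketch: an arbitrary 8-layer hybrid pattern, just to show how
# hybrid_override_pattern maps onto per-layer block types.
from vllm.transformers_utils.configs import NemotronHConfig

cfg = NemotronHConfig(
    num_hidden_layers=8,
    hybrid_override_pattern="M-M*M-M-",  # must match num_hidden_layers in length
)
print(cfg.layers_block_type)
# ['mamba', 'mlp', 'mamba', 'attention', 'mamba', 'mlp', 'mamba', 'mlp']
```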

OvisConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/ovis.py
class OvisConfig(PretrainedConfig):
    model_type = "ovis"

    def __init__(self,
                 llm_config: Optional[Union[PretrainedConfig, dict]] = None,
                 visual_tokenizer_config: Optional[Union[PretrainedConfig,
                                                         dict]] = None,
                 multimodal_max_length=8192,
                 hidden_size=None,
                 conversation_formatter_class=None,
                 llm_attn_implementation=None,
                 disable_tie_weight=False,
                 **kwargs):
        super().__init__(**kwargs)
        if llm_config is not None:
            assert isinstance(llm_config, (PretrainedConfig, dict)), \
                f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
            if not isinstance(llm_config, PretrainedConfig):
                model_type = llm_config['model_type']
                llm_config.pop('model_type')
                llm_config = AutoConfig.for_model(model_type, **llm_config)

        # map llm_config to text_config
        self.text_config = llm_config
        if visual_tokenizer_config is not None:
            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
                f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
            if not isinstance(visual_tokenizer_config, PretrainedConfig):
                model_type = visual_tokenizer_config['model_type']
                visual_tokenizer_config.pop('model_type')
                visual_tokenizer_config = AutoConfig.for_model(
                    model_type, **visual_tokenizer_config)

        self.visual_tokenizer_config = visual_tokenizer_config
        self.multimodal_max_length = multimodal_max_length
        self.hidden_size = hidden_size
        self.conversation_formatter_class = conversation_formatter_class
        self.llm_attn_implementation = llm_attn_implementation
        self.disable_tie_weight = disable_tie_weight

conversation_formatter_class instance-attribute

conversation_formatter_class = conversation_formatter_class

disable_tie_weight instance-attribute

disable_tie_weight = disable_tie_weight

hidden_size instance-attribute

hidden_size = hidden_size

llm_attn_implementation instance-attribute

llm_attn_implementation = llm_attn_implementation

model_type class-attribute instance-attribute

model_type = 'ovis'

multimodal_max_length instance-attribute

multimodal_max_length = multimodal_max_length

text_config instance-attribute

text_config = llm_config

visual_tokenizer_config instance-attribute

visual_tokenizer_config = visual_tokenizer_config

__init__

__init__(
    llm_config: Optional[
        Union[PretrainedConfig, dict]
    ] = None,
    visual_tokenizer_config: Optional[
        Union[PretrainedConfig, dict]
    ] = None,
    multimodal_max_length=8192,
    hidden_size=None,
    conversation_formatter_class=None,
    llm_attn_implementation=None,
    disable_tie_weight=False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/ovis.py
def __init__(self,
             llm_config: Optional[Union[PretrainedConfig, dict]] = None,
             visual_tokenizer_config: Optional[Union[PretrainedConfig,
                                                     dict]] = None,
             multimodal_max_length=8192,
             hidden_size=None,
             conversation_formatter_class=None,
             llm_attn_implementation=None,
             disable_tie_weight=False,
             **kwargs):
    super().__init__(**kwargs)
    if llm_config is not None:
        assert isinstance(llm_config, (PretrainedConfig, dict)), \
            f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
        if not isinstance(llm_config, PretrainedConfig):
            model_type = llm_config['model_type']
            llm_config.pop('model_type')
            llm_config = AutoConfig.for_model(model_type, **llm_config)

    # map llm_config to text_config
    self.text_config = llm_config
    if visual_tokenizer_config is not None:
        assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
            f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
        if not isinstance(visual_tokenizer_config, PretrainedConfig):
            model_type = visual_tokenizer_config['model_type']
            visual_tokenizer_config.pop('model_type')
            visual_tokenizer_config = AutoConfig.for_model(
                model_type, **visual_tokenizer_config)

    self.visual_tokenizer_config = visual_tokenizer_config
    self.multimodal_max_length = multimodal_max_length
    self.hidden_size = hidden_size
    self.conversation_formatter_class = conversation_formatter_class
    self.llm_attn_implementation = llm_attn_implementation
    self.disable_tie_weight = disable_tie_weight
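
As a rough usage sketch (the llm_config values below are illustrative, not a real checkpoint config): passing a plain dict lets OvisConfig rebuild the sub-config through AutoConfig.for_model, and the LLM config is then exposed as text_config.

```python
# Hedged sketch: a dict llm_config is rebuilt via AutoConfig.for_model and
# mapped to text_config; the values here are arbitrary examples.
from vllm.transformers_utils.configs import OvisConfig

cfg = OvisConfig(
    llm_config={"model_type": "llama", "hidden_size": 1024, "num_hidden_layers": 2},
    multimodal_max_length=4096,
)
print(type(cfg.text_config).__name__)  # LlamaConfig
print(cfg.multimodal_max_length)       # 4096
```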

RWConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/falcon.py
class RWConfig(PretrainedConfig):
    model_type = "falcon"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_kv_heads": "n_head_kv",
    }

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        multi_query=True,
        n_head_kv=None,
        alibi=False,
        bias=False,
        parallel_attn=False,
        new_decoder_architecture=False,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size
        # Backward compatibility with n_embed kwarg
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.multi_query = multi_query
        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
        self.alibi = alibi
        self.bias = bias
        self.parallel_attn = parallel_attn
        self.new_decoder_architecture = new_decoder_architecture

        if self.hidden_size == 8192:
            # Hack for falcon-40b
            self.new_decoder_architecture = True

        super().__init__(bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)

    @property
    def head_dim(self):
        return self.hidden_size // self.n_head

    @property
    def rotary(self):
        return not self.alibi

alibi instance-attribute

alibi = alibi

attention_dropout instance-attribute

attention_dropout = attention_dropout

attribute_map class-attribute instance-attribute

attribute_map = {
    "num_hidden_layers": "n_layer",
    "num_attention_heads": "n_head",
    "num_kv_heads": "n_head_kv",
}

bias instance-attribute

bias = bias

bos_token_id instance-attribute

bos_token_id = bos_token_id

eos_token_id instance-attribute

eos_token_id = eos_token_id

head_dim property

head_dim

hidden_dropout instance-attribute

hidden_dropout = hidden_dropout

hidden_size instance-attribute

hidden_size = hidden_size if n_embed is None else n_embed

initializer_range instance-attribute

initializer_range = initializer_range

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_epsilon instance-attribute

layer_norm_epsilon = layer_norm_epsilon

model_type class-attribute instance-attribute

model_type = 'falcon'

multi_query instance-attribute

multi_query = multi_query

n_head instance-attribute

n_head = n_head

n_head_kv instance-attribute

n_head_kv = 1 if n_head_kv is None else n_head_kv

n_layer instance-attribute

n_layer = n_layer

new_decoder_architecture instance-attribute

new_decoder_architecture = new_decoder_architecture

parallel_attn instance-attribute

parallel_attn = parallel_attn

rotary property

rotary

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=250880,
    hidden_size=64,
    n_layer=2,
    n_head=8,
    layer_norm_epsilon=1e-05,
    initializer_range=0.02,
    use_cache=True,
    bos_token_id=1,
    eos_token_id=2,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    multi_query=True,
    n_head_kv=None,
    alibi=False,
    bias=False,
    parallel_attn=False,
    new_decoder_architecture=False,
    **kwargs,
) -> None
Source code in vllm/transformers_utils/configs/falcon.py
def __init__(
    self,
    vocab_size=250880,
    hidden_size=64,
    n_layer=2,
    n_head=8,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    use_cache=True,
    bos_token_id=1,
    eos_token_id=2,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    multi_query=True,
    n_head_kv=None,
    alibi=False,
    bias=False,
    parallel_attn=False,
    new_decoder_architecture=False,
    **kwargs,
) -> None:
    self.vocab_size = vocab_size
    # Backward compatibility with n_embed kwarg
    n_embed = kwargs.pop("n_embed", None)
    self.hidden_size = hidden_size if n_embed is None else n_embed
    self.n_layer = n_layer
    self.n_head = n_head
    self.layer_norm_epsilon = layer_norm_epsilon
    self.initializer_range = initializer_range
    self.use_cache = use_cache
    self.hidden_dropout = hidden_dropout
    self.attention_dropout = attention_dropout

    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id
    self.multi_query = multi_query
    self.n_head_kv = 1 if n_head_kv is None else n_head_kv
    self.alibi = alibi
    self.bias = bias
    self.parallel_attn = parallel_attn
    self.new_decoder_architecture = new_decoder_architecture

    if self.hidden_size == 8192:
        # Hack for falcon-40b
        self.new_decoder_architecture = True

    super().__init__(bos_token_id=bos_token_id,
                     eos_token_id=eos_token_id,
                     **kwargs)
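
A small sketch of how the Falcon-style names interact with the HF-style aliases and the derived properties (values chosen arbitrarily for illustration):

```python
# Hedged sketch: attribute_map resolves HF-style names to Falcon's native
# ones, and head_dim / rotary are derived properties.
from vllm.transformers_utils.configs import RWConfig

cfg = RWConfig(hidden_size=64, n_layer=2, n_head=8, alibi=False)
print(cfg.num_hidden_layers)  # 2  (alias for n_layer)
print(cfg.head_dim)           # 8  (hidden_size // n_head)
print(cfg.rotary)             # True, since alibi=False
```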

SkyworkR1VChatConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/skyworkr1v.py
class SkyworkR1VChatConfig(PretrainedConfig):
    model_type = 'internvl_chat'
    is_composition = True

    def __init__(self,
                 vision_config=None,
                 llm_config=None,
                 use_backbone_lora=0,
                 use_llm_lora=0,
                 select_layer=-1,
                 force_image_size=None,
                 downsample_ratio=0.5,
                 template=None,
                 dynamic_image_size=False,
                 use_thumbnail=False,
                 ps_version='v1',
                 min_dynamic_patch=1,
                 max_dynamic_patch=6,
                 **kwargs):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}

        if llm_config is None:
            llm_config = {}

        self.vision_config = PretrainedConfig(**vision_config)
        self.text_config = PretrainedConfig(**llm_config)

        self.use_backbone_lora = use_backbone_lora
        self.use_llm_lora = use_llm_lora
        self.select_layer = select_layer
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail = use_thumbnail
        self.ps_version = ps_version  # pixel shuffle version
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch

downsample_ratio instance-attribute

downsample_ratio = downsample_ratio

dynamic_image_size instance-attribute

dynamic_image_size = dynamic_image_size

force_image_size instance-attribute

force_image_size = force_image_size

is_composition class-attribute instance-attribute

is_composition = True

max_dynamic_patch instance-attribute

max_dynamic_patch = max_dynamic_patch

min_dynamic_patch instance-attribute

min_dynamic_patch = min_dynamic_patch

model_type class-attribute instance-attribute

model_type = 'internvl_chat'

ps_version instance-attribute

ps_version = ps_version

select_layer instance-attribute

select_layer = select_layer

template instance-attribute

template = template

text_config instance-attribute

text_config = PretrainedConfig(**llm_config)

use_backbone_lora instance-attribute

use_backbone_lora = use_backbone_lora

use_llm_lora instance-attribute

use_llm_lora = use_llm_lora

use_thumbnail instance-attribute

use_thumbnail = use_thumbnail

vision_config instance-attribute

vision_config = PretrainedConfig(**vision_config)

__init__

__init__(
    vision_config=None,
    llm_config=None,
    use_backbone_lora=0,
    use_llm_lora=0,
    select_layer=-1,
    force_image_size=None,
    downsample_ratio=0.5,
    template=None,
    dynamic_image_size=False,
    use_thumbnail=False,
    ps_version="v1",
    min_dynamic_patch=1,
    max_dynamic_patch=6,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/skyworkr1v.py
def __init__(self,
             vision_config=None,
             llm_config=None,
             use_backbone_lora=0,
             use_llm_lora=0,
             select_layer=-1,
             force_image_size=None,
             downsample_ratio=0.5,
             template=None,
             dynamic_image_size=False,
             use_thumbnail=False,
             ps_version='v1',
             min_dynamic_patch=1,
             max_dynamic_patch=6,
             **kwargs):
    super().__init__(**kwargs)

    if vision_config is None:
        vision_config = {}

    if llm_config is None:
        llm_config = {}

    self.vision_config = PretrainedConfig(**vision_config)
    self.text_config = PretrainedConfig(**llm_config)

    self.use_backbone_lora = use_backbone_lora
    self.use_llm_lora = use_llm_lora
    self.select_layer = select_layer
    self.force_image_size = force_image_size
    self.downsample_ratio = downsample_ratio
    self.template = template
    self.dynamic_image_size = dynamic_image_size
    self.use_thumbnail = use_thumbnail
    self.ps_version = ps_version  # pixel shuffle version
    self.min_dynamic_patch = min_dynamic_patch
    self.max_dynamic_patch = max_dynamic_patch
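
A brief, illustrative sketch (the nested dict values are arbitrary): the vision_config and llm_config dicts are wrapped into plain PretrainedConfig objects rather than model-specific config classes.

```python
# Hedged sketch: sub-config dicts become bare PretrainedConfig instances;
# the values below are arbitrary examples.
from vllm.transformers_utils.configs import SkyworkR1VChatConfig

cfg = SkyworkR1VChatConfig(
    vision_config={"hidden_size": 1024},
    llm_config={"hidden_size": 4096},
    dynamic_image_size=True,
    max_dynamic_patch=12,
)
print(cfg.vision_config.hidden_size)  # 1024
print(cfg.max_dynamic_patch)          # 12
```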

SolarConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [SolarModel]. It is used to instantiate an LLaMA model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a configuration similar to that of LLaMA-7B. Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation of [PretrainedConfig] for more information.

Args:
    vocab_size (int, optional, defaults to 32000): Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the inputs_ids passed when calling [SolarModel].
    hidden_size (int, optional, defaults to 4096): Dimension of the hidden representations.
    intermediate_size (int, optional, defaults to 11008): Dimension of the MLP representations.
    num_hidden_layers (int, optional, defaults to 32): Number of hidden layers in the Transformer decoder.
    num_attention_heads (int, optional, defaults to 32): Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (int, optional): Number of key/value heads used to implement Grouped Query Attention. If num_key_value_heads=num_attention_heads, the model uses Multi-Head Attention (MHA); if num_key_value_heads=1, it uses Multi-Query Attention (MQA); otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group's key and value head should be constructed by mean-pooling all the original heads within that group. For more details, see this paper (https://arxiv.org/pdf/2305.13245.pdf). Defaults to num_attention_heads if not specified.
    hidden_act (str or function, optional, defaults to "silu"): The non-linear activation function (function or string) in the decoder.
    max_position_embeddings (int, optional, defaults to 2048): The maximum sequence length this model might ever be used with. Solar 1 supports up to 2048 tokens, Solar 2 up to 4096, CodeSolar up to 16384.
    initializer_range (float, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (float, optional, defaults to 1e-06): The epsilon used by the RMS normalization layers.
    use_cache (bool, optional, defaults to True): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if config.is_decoder=True.
    pad_token_id (int, optional): Padding token id.
    bos_token_id (int, optional, defaults to 1): Beginning-of-stream token id.
    eos_token_id (int, optional, defaults to 2): End-of-stream token id.
    pretraining_tp (int, optional, defaults to 1): Experimental feature. Tensor parallelism rank used during pretraining. Please refer to this document (https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to understand more about it. This value is necessary to ensure exact reproducibility of the pretraining results. Please refer to this issue (https://github.com/pytorch/pytorch/issues/76232).
    tie_word_embeddings (bool, optional, defaults to False): Whether to tie weight embeddings.
    rope_theta (float, optional, defaults to 10000.0): The base period of the RoPE embeddings.
    rope_scaling (dict, optional): Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is {"type": strategy name, "factor": scaling factor}. When using this flag, don't update max_position_embeddings to the expected new maximum. See the following thread for more information on how these scaling strategies behave: https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an experimental feature, subject to breaking API changes in future versions.
    attention_bias (bool, optional, defaults to False): Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (float, optional, defaults to 0.0): The dropout ratio for the attention probabilities.
    mlp_bias (bool, optional, defaults to False): Whether to use a bias in the up_proj, down_proj and gate_proj layers in the MLP layers.
    sliding_window (int, optional, defaults to 2047): Sliding window attention window size. If not specified, will default to 2047.

>>> from transformers import SolarModel, SolarConfig
>>> # Initializing a Solar-pro style configuration
>>> configuration = SolarConfig()
>>> # Initializing a model from the Solar-pro style configuration
>>> model = SolarModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config

Source code in vllm/transformers_utils/configs/solar.py
class SolarConfig(PretrainedConfig):
    r"""
    This is the configuration class to store
    the configuration of a [`SolarModel`].
    It is used to instantiate an LLaMA model
    according to the specified arguments,
    defining the model architecture.
    Instantiating a configuration with the
    defaults will yield a similar
    configuration to that of the LLaMA-7B.
    Configuration objects inherit from [`PretrainedConfig`]
    and can be used to control the model outputs.
    Read the documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the LLaMA model.
            Defines the number of different tokens
            that can be represented by the `inputs_ids`
            passed when calling [`SolarModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer
            in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that
            should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`,
            the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model
            will use Multi Query Attention (MQA)
            otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint,
            each group key and value head should be constructed
            by meanpooling all the original heads within that group.
            For more details checkout [this paper]
            (https://arxiv.org/pdf/2305.13245.pdf).
            If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string)
            in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
            Solar 1 supports up to 2048 tokens,
            Solar 2 up to 4096, CodeSolar up to 16384.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of
            the truncated_normal_initializer for initializing
            all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return
            the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank
            used during pretraining.
            Please refer to [this
            document](https://huggingface.co/docs/
            transformers/main/
            perf_train_gpu_many#tensor-parallelism)
             to understand more about it. This value is
            necessary to ensure exact reproducibility
            of the pretraining results.
            Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            Dictionary containing the scaling configuration for
            the RoPE embeddings.
            Currently supports two scaling
            strategies: linear and dynamic.
            Their scaling factor must be a float greater than 1.
            The expected format is
            `{"type": strategy name, "factor": scaling factor}`.
            When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
            See the following thread for more information on how
            these scaling strategies behave:
            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/
            dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking
            API changes in future versions.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value
            and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj
            layers in the MLP layers.
        sliding_window (`int`, *optional*, defaults to 2047):
            Sliding window attention window size. If not specified,
            will default to `2047`.
    ```python
    >>> from transformers import SolarModel, SolarConfig
    >>> # Initializing a Solar-pro style configuration
    >>> configuration = SolarConfig()
    >>> # Initializing a model from the Solar-pro style configuration
    >>> model = SolarModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "solar"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        sliding_window=2047,
        bskcn_1=None,
        bskcn_2=None,
        bskcn_3=None,
        bskcn_4=None,
        bskcn_tv=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        self.sliding_window = sliding_window
        self.bskcn_1 = bskcn_1 if bskcn_1 is not None else [12, 20, 32, 44]
        self.bskcn_2 = bskcn_2 if bskcn_2 is not None else [20, 32]
        self.bskcn_3 = bskcn_3 if bskcn_3 is not None else [16, 24, 36, 48]
        self.bskcn_4 = bskcn_4 if bskcn_4 is not None else [28, 40]
        self.bskcn_tv = bskcn_tv if bskcn_tv is not None else [0.9, 0.8]

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if (not isinstance(self.rope_scaling, dict)
                or len(self.rope_scaling) != 2):
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields,"
                " `type` and `factor`, "
                f"got {self.rope_scaling}")
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in [
                "linear",
                "dynamic",
        ]:
            raise ValueError(f"`rope_scaling`'s type field must be one of "
                             f"['linear', 'dynamic'], got {rope_scaling_type}")
        if (rope_scaling_factor is None
                or not isinstance(rope_scaling_factor, float)
                or rope_scaling_factor <= 1.0):
            raise ValueError(
                f"`rope_scaling`'s factor field must be a float > 1,"
                f" got {rope_scaling_factor}")

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

bskcn_1 instance-attribute

bskcn_1 = (
    bskcn_1 if bskcn_1 is not None else [12, 20, 32, 44]
)

bskcn_2 instance-attribute

bskcn_2 = bskcn_2 if bskcn_2 is not None else [20, 32]

bskcn_3 instance-attribute

bskcn_3 = (
    bskcn_3 if bskcn_3 is not None else [16, 24, 36, 48]
)

bskcn_4 instance-attribute

bskcn_4 = bskcn_4 if bskcn_4 is not None else [28, 40]

bskcn_tv instance-attribute

bskcn_tv = bskcn_tv if bskcn_tv is not None else [0.9, 0.8]

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

mlp_bias instance-attribute

mlp_bias = mlp_bias

model_type class-attribute instance-attribute

model_type = 'solar'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

pretraining_tp instance-attribute

pretraining_tp = pretraining_tp

rms_norm_eps instance-attribute

rms_norm_eps = rms_norm_eps

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

sliding_window instance-attribute

sliding_window = sliding_window

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=32000,
    hidden_size=4096,
    intermediate_size=11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=None,
    hidden_act="silu",
    max_position_embeddings=2048,
    initializer_range=0.02,
    rms_norm_eps=1e-06,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=1,
    eos_token_id=2,
    pretraining_tp=1,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    mlp_bias=False,
    sliding_window=2047,
    bskcn_1=None,
    bskcn_2=None,
    bskcn_3=None,
    bskcn_4=None,
    bskcn_tv=None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/solar.py
def __init__(
    self,
    vocab_size=32000,
    hidden_size=4096,
    intermediate_size=11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=None,
    hidden_act="silu",
    max_position_embeddings=2048,
    initializer_range=0.02,
    rms_norm_eps=1e-6,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=1,
    eos_token_id=2,
    pretraining_tp=1,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    mlp_bias=False,
    sliding_window=2047,
    bskcn_1=None,
    bskcn_2=None,
    bskcn_3=None,
    bskcn_4=None,
    bskcn_tv=None,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.rms_norm_eps = rms_norm_eps
    self.pretraining_tp = pretraining_tp
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    self._rope_scaling_validation()
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout
    self.mlp_bias = mlp_bias
    self.sliding_window = sliding_window
    self.bskcn_1 = bskcn_1 if bskcn_1 is not None else [12, 20, 32, 44]
    self.bskcn_2 = bskcn_2 if bskcn_2 is not None else [20, 32]
    self.bskcn_3 = bskcn_3 if bskcn_3 is not None else [16, 24, 36, 48]
    self.bskcn_4 = bskcn_4 if bskcn_4 is not None else [28, 40]
    self.bskcn_tv = bskcn_tv if bskcn_tv is not None else [0.9, 0.8]

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )

_rope_scaling_validation

_rope_scaling_validation()

Validate the rope_scaling configuration.

Source code in vllm/transformers_utils/configs/solar.py
def _rope_scaling_validation(self):
    """
    Validate the `rope_scaling` configuration.
    """
    if self.rope_scaling is None:
        return

    if (not isinstance(self.rope_scaling, dict)
            or len(self.rope_scaling) != 2):
        raise ValueError(
            "`rope_scaling` must be a dictionary with two fields,"
            " `type` and `factor`, "
            f"got {self.rope_scaling}")
    rope_scaling_type = self.rope_scaling.get("type", None)
    rope_scaling_factor = self.rope_scaling.get("factor", None)
    if rope_scaling_type is None or rope_scaling_type not in [
            "linear",
            "dynamic",
    ]:
        raise ValueError(f"`rope_scaling`'s type field must be one of "
                         f"['linear', 'dynamic'], got {rope_scaling_type}")
    if (rope_scaling_factor is None
            or not isinstance(rope_scaling_factor, float)
            or rope_scaling_factor <= 1.0):
        raise ValueError(
            f"`rope_scaling`'s factor field must be a float > 1,"
            f" got {rope_scaling_factor}")

Telechat2Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/telechat2.py
class Telechat2Config(PretrainedConfig):

    model_type = "telechat"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "intermediate_size": "ffn_hidden_size",
        "rms_norm_eps": "layer_norm_epsilon"
    }

    def __init__(
        self,
        vocab_size=160256,
        hidden_size=4096,
        n_layer=30,
        n_head=32,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        apply_residual_connection_post_layernorm=False,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        ffn_hidden_size=12288,
        training_seqlen=8192,
        logn=True,
        embed_layernorm=False,
        hidden_act="silu",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.apply_residual_connection_post_layernorm = (
            apply_residual_connection_post_layernorm)
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.logn = logn
        self.training_seqlen = training_seqlen
        self.embed_layernorm = embed_layernorm
        self.num_key_value_heads = kwargs.pop("num_key_value_heads", None)
        self.ffn_hidden_size = ffn_hidden_size
        self.hidden_act = hidden_act
        super().__init__(bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)

apply_residual_connection_post_layernorm instance-attribute

apply_residual_connection_post_layernorm = (
    apply_residual_connection_post_layernorm
)

attention_dropout instance-attribute

attention_dropout = attention_dropout

attribute_map class-attribute instance-attribute

attribute_map = {
    "num_hidden_layers": "n_layer",
    "num_attention_heads": "n_head",
    "intermediate_size": "ffn_hidden_size",
    "rms_norm_eps": "layer_norm_epsilon",
}

bos_token_id instance-attribute

bos_token_id = bos_token_id

embed_layernorm instance-attribute

embed_layernorm = embed_layernorm

eos_token_id instance-attribute

eos_token_id = eos_token_id

ffn_hidden_size instance-attribute

ffn_hidden_size = ffn_hidden_size

hidden_act instance-attribute

hidden_act = hidden_act

hidden_dropout instance-attribute

hidden_dropout = hidden_dropout

hidden_size instance-attribute

hidden_size = hidden_size if n_embed is None else n_embed

initializer_range instance-attribute

initializer_range = initializer_range

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_epsilon instance-attribute

layer_norm_epsilon = layer_norm_epsilon

logn instance-attribute

logn = logn

model_type class-attribute instance-attribute

model_type = 'telechat'

n_head instance-attribute

n_head = n_head

n_layer instance-attribute

n_layer = n_layer

num_key_value_heads instance-attribute

num_key_value_heads = kwargs.pop('num_key_value_heads', None)

training_seqlen instance-attribute

training_seqlen = training_seqlen

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=160256,
    hidden_size=4096,
    n_layer=30,
    n_head=32,
    layer_norm_epsilon=1e-05,
    initializer_range=0.02,
    use_cache=True,
    bos_token_id=1,
    eos_token_id=2,
    apply_residual_connection_post_layernorm=False,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    ffn_hidden_size=12288,
    training_seqlen=8192,
    logn=True,
    embed_layernorm=False,
    hidden_act="silu",
    **kwargs,
)
Source code in vllm/transformers_utils/configs/telechat2.py
def __init__(
    self,
    vocab_size=160256,
    hidden_size=4096,
    n_layer=30,
    n_head=32,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    use_cache=True,
    bos_token_id=1,
    eos_token_id=2,
    apply_residual_connection_post_layernorm=False,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    ffn_hidden_size=12288,
    training_seqlen=8192,
    logn=True,
    embed_layernorm=False,
    hidden_act="silu",
    **kwargs,
):
    self.vocab_size = vocab_size
    n_embed = kwargs.pop("n_embed", None)
    self.hidden_size = hidden_size if n_embed is None else n_embed
    self.n_layer = n_layer
    self.n_head = n_head
    self.layer_norm_epsilon = layer_norm_epsilon
    self.initializer_range = initializer_range
    self.use_cache = use_cache
    self.apply_residual_connection_post_layernorm = (
        apply_residual_connection_post_layernorm)
    self.hidden_dropout = hidden_dropout
    self.attention_dropout = attention_dropout
    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id
    self.logn = logn
    self.training_seqlen = training_seqlen
    self.embed_layernorm = embed_layernorm
    self.num_key_value_heads = kwargs.pop("num_key_value_heads", None)
    self.ffn_hidden_size = ffn_hidden_size
    self.hidden_act = hidden_act
    super().__init__(bos_token_id=bos_token_id,
                     eos_token_id=eos_token_id,
                     **kwargs)
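
A minimal sketch (using the constructor defaults shown above) of how attribute_map exposes Llama-style names on top of the Telechat-native ones:

```python
# Hedged sketch: HF/Llama-style names resolve through attribute_map.
from vllm.transformers_utils.configs import Telechat2Config

cfg = Telechat2Config(n_layer=30, n_head=32, ffn_hidden_size=12288)
print(cfg.num_hidden_layers)  # 30     (alias for n_layer)
print(cfg.intermediate_size)  # 12288  (alias for ffn_hidden_size)
print(cfg.rms_norm_eps)       # 1e-05  (alias for layer_norm_epsilon)
```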

UltravoxConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [UltravoxForConditionalGeneration]. It is used to instantiate an Ultravox model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| audio_config | `Union[AutoConfig, dict]`, *optional* | Custom audio config or dict. | `None` |
| text_config | `Union[AutoConfig, dict]`, *optional* | The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`. | `None` |
| ignore_index | `int`, *optional*, defaults to -100 | The ignore index for the loss function. | `-100` |
| audio_token_index | `int`, *optional*, defaults to 32000 | The audio token index to encode the audio prompt. | `32000` |
| stack_factor | `int`, *optional*, defaults to 8 | Audio downsampling factor for the multimodal projector. | `8` |
| norm_init | `float`, *optional*, defaults to 0.4 | The initialization value for the layer normalization. | `0.4` |
| projector_act | `str`, *optional*, defaults to `"swiglu"` | The activation function used by the multimodal projector. | `'swiglu'` |
| text_model_lora_config | `LoraConfigSimplified`, *optional* | The LoRA configuration for finetuning the text model. | `None` |
| audio_model_lora_config | `LoraConfigSimplified`, *optional* | The LoRA configuration for finetuning the audio model. | `None` |
| projector_ln_mid | `bool`, *optional*, defaults to `False` | Whether to apply layer normalization at the middle of the projector or at the end. Versions v0.4.1 and below use `False`, but v0.5 and above use `True`. | `False` |
Source code in vllm/transformers_utils/configs/ultravox.py
class UltravoxConfig(transformers.PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`UltravoxForConditionalGeneration`]. It is used to instantiate an
    Ultravox model according to the specified arguments, defining the model
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        audio_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom audio config or dict
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone. Can be any of `LlamaConfig`
            or `MistralConfig`.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        audio_token_index (`int`, *optional*, defaults to 32000):
            The audio token index to encode the audio prompt.
        stack_factor (`int`, *optional*, defaults to 8):
            Audio downsampling factor for the multimodal projector.
        norm_init (`float`, *optional*, defaults to 0.4):
            The initialization value for the layer normalization.
        projector_act (`str`, *optional*, defaults to `"swiglu"`):
            The activation function used by the multimodal projector.
        text_model_lora_config (`LoraConfigSimplified`, *optional*):
            The LoRA configuration for finetuning the text model.
        audio_model_lora_config (`LoraConfigSimplified`, *optional*):
            The LoRA configuration for finetuning the audio model.
        projector_ln_mid (`bool`, *optional*, defaults to `False`):
            Whether to apply layer normalization at the middle of the
            projector or at the end. Versions v0.4.1 and below
            use `False`, but v0.5 and above use `True`.
    """

    model_type = "ultravox"
    is_composition = False

    def __init__(
        self,
        audio_config: Optional[dict[str, Any]] = None,
        text_config: Optional[dict[str, Any]] = None,
        audio_model_id: Optional[str] = None,
        text_model_id: Optional[str] = None,
        ignore_index: int = -100,
        audio_token_index: int = 32000,
        hidden_size: int = 4096,
        stack_factor: int = 8,
        norm_init: float = 0.4,
        projector_act: str = "swiglu",
        text_model_lora_config: Optional[dict[str, Any]] = None,
        audio_model_lora_config: Optional[dict[str, Any]] = None,
        projector_ln_mid: bool = False,
        **kwargs,
    ):
        self.ignore_index = ignore_index

        self.audio_model_id = audio_model_id
        self.text_model_id = text_model_id
        self.audio_token_index = audio_token_index

        self.hidden_size = hidden_size
        self.stack_factor = stack_factor
        self.norm_init = norm_init
        self.projector_act = projector_act
        self.projector_ln_mid = projector_ln_mid

        if text_model_id is not None:
            # Avoid circular import
            from vllm.transformers_utils.config import get_config

            self.text_config = get_config(text_model_id,
                                          trust_remote_code=False)
        else:
            text_config = text_config or {}
            self.text_config = transformers.CONFIG_MAPPING[text_config.get(
                "model_type", "llama")](**text_config)

        if audio_model_id is not None:
            # Avoid circular import
            from vllm.transformers_utils.config import get_config

            self.audio_config = get_config(audio_model_id,
                                           trust_remote_code=False)
        else:
            audio_config = audio_config or {}
            self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
                "model_type", "whisper")](**audio_config)

        self.text_model_lora_config = text_model_lora_config or {}
        self.audio_model_lora_config = audio_model_lora_config or {}

        self.vocab_size = self.text_config.vocab_size

        self.initializer_range = self.text_config.initializer_range

        super().__init__(**kwargs)

audio_config instance-attribute

audio_config = get_config(
    audio_model_id, trust_remote_code=False
)

audio_model_id instance-attribute

audio_model_id = audio_model_id

audio_model_lora_config instance-attribute

audio_model_lora_config = audio_model_lora_config or {}

audio_token_index instance-attribute

audio_token_index = audio_token_index

hidden_size instance-attribute

hidden_size = hidden_size

ignore_index instance-attribute

ignore_index = ignore_index

initializer_range instance-attribute

initializer_range = text_config.initializer_range

is_composition class-attribute instance-attribute

is_composition = False

model_type class-attribute instance-attribute

model_type = 'ultravox'

norm_init instance-attribute

norm_init = norm_init

projector_act instance-attribute

projector_act = projector_act

projector_ln_mid instance-attribute

projector_ln_mid = projector_ln_mid

stack_factor instance-attribute

stack_factor = stack_factor

text_config instance-attribute

text_config = get_config(
    text_model_id, trust_remote_code=False
)

text_model_id instance-attribute

text_model_id = text_model_id

text_model_lora_config instance-attribute

text_model_lora_config = text_model_lora_config or {}

vocab_size instance-attribute

vocab_size = text_config.vocab_size

__init__

__init__(
    audio_config: Optional[dict[str, Any]] = None,
    text_config: Optional[dict[str, Any]] = None,
    audio_model_id: Optional[str] = None,
    text_model_id: Optional[str] = None,
    ignore_index: int = -100,
    audio_token_index: int = 32000,
    hidden_size: int = 4096,
    stack_factor: int = 8,
    norm_init: float = 0.4,
    projector_act: str = "swiglu",
    text_model_lora_config: Optional[dict[str, Any]] = None,
    audio_model_lora_config: Optional[
        dict[str, Any]
    ] = None,
    projector_ln_mid: bool = False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/ultravox.py
def __init__(
    self,
    audio_config: Optional[dict[str, Any]] = None,
    text_config: Optional[dict[str, Any]] = None,
    audio_model_id: Optional[str] = None,
    text_model_id: Optional[str] = None,
    ignore_index: int = -100,
    audio_token_index: int = 32000,
    hidden_size: int = 4096,
    stack_factor: int = 8,
    norm_init: float = 0.4,
    projector_act: str = "swiglu",
    text_model_lora_config: Optional[dict[str, Any]] = None,
    audio_model_lora_config: Optional[dict[str, Any]] = None,
    projector_ln_mid: bool = False,
    **kwargs,
):
    self.ignore_index = ignore_index

    self.audio_model_id = audio_model_id
    self.text_model_id = text_model_id
    self.audio_token_index = audio_token_index

    self.hidden_size = hidden_size
    self.stack_factor = stack_factor
    self.norm_init = norm_init
    self.projector_act = projector_act
    self.projector_ln_mid = projector_ln_mid

    if text_model_id is not None:
        # Avoid circular import
        from vllm.transformers_utils.config import get_config

        self.text_config = get_config(text_model_id,
                                      trust_remote_code=False)
    else:
        text_config = text_config or {}
        self.text_config = transformers.CONFIG_MAPPING[text_config.get(
            "model_type", "llama")](**text_config)

    if audio_model_id is not None:
        # Avoid circular import
        from vllm.transformers_utils.config import get_config

        self.audio_config = get_config(audio_model_id,
                                       trust_remote_code=False)
    else:
        audio_config = audio_config or {}
        self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
            "model_type", "whisper")](**audio_config)

    self.text_model_lora_config = text_model_lora_config or {}
    self.audio_model_lora_config = audio_model_lora_config or {}

    self.vocab_size = self.text_config.vocab_size

    self.initializer_range = self.text_config.initializer_range

    super().__init__(**kwargs)
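
As a rough sketch (no model ids or sub-config dicts supplied, so everything falls back to defaults): when text_model_id and audio_model_id are omitted, the text and audio sub-configs are built from transformers.CONFIG_MAPPING with the "llama" and "whisper" defaults, and vocab_size and initializer_range are copied from the text config.

```python
# Hedged sketch: default construction falls back to Llama / Whisper configs
# via transformers.CONFIG_MAPPING (no checkpoint is fetched here).
from vllm.transformers_utils.configs import UltravoxConfig

cfg = UltravoxConfig(stack_factor=8, projector_act="swiglu")
print(type(cfg.text_config).__name__)   # LlamaConfig
print(type(cfg.audio_config).__name__)  # WhisperConfig
print(cfg.vocab_size == cfg.text_config.vocab_size)  # True
```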