vllm.model_executor.layers.quantization.compressed_tensors.utils

_find_first_match ¶

_find_first_match(
    value: str,
    targets: Iterable[str],
    check_contains: bool = False,
) -> Optional[str]

Returns first element of target that matches value either exactly or as a regex after 're:'. If check_contains is set to True, additionally checks if the target string is contained within the value.

:param value: string to compare the list of targets against :param targets: list of targets to match the layer against :param check_contains: whether or not to do a substring match

Source code in vllm/model_executor/layers/quantization/compressed_tensors/utils.py

def _find_first_match(value: str,
                      targets: Iterable[str],
                      check_contains: bool = False) -> Optional[str]:
    """
    Returns first element of target that matches value either
    exactly or as a regex after 're:'. If check_contains is set to True,
    additionally checks if the target string is contained within the value.

    :param value: string to compare the list of targets against
    :param targets: list of targets to match the layer against
    :param check_contains: whether or not to do a substring match
    """

    for target in targets:
        if _is_equal_or_regex_match(value,
                                    target,
                                    check_contains=check_contains):
            return target
    return None

_is_equal_or_regex_match ¶

_is_equal_or_regex_match(
    value: str, target: str, check_contains: bool = False
) -> bool

Checks whether a value is exactly equal or a regex match for target if target starts with 're:'. If check_contains is set to True, additionally checks if the target string is contained within the value.

Source code in vllm/model_executor/layers/quantization/compressed_tensors/utils.py

def _is_equal_or_regex_match(value: str,
                             target: str,
                             check_contains: bool = False) -> bool:
    """
    Checks whether a value is exactly equal or a regex match for target
    if target starts with 're:'. If check_contains is set to True,
    additionally checks if the target string is contained within the value.
    """

    if target.startswith("re:"):
        pattern = target[3:]
        if re.match(pattern, value):
            return True
    elif check_contains:
        if target.lower() in value.lower():
            return True
    elif target == value:
        return True
    return False

_match_fused_layer ¶

_match_fused_layer(
    layer_name: str,
    target_layers: Iterable[str],
    fused_mapping: Mapping[str, list[str]],
) -> Optional[str]

Match a fused layer name to its corresponding individual layer in target_layers. Returns first value in fused_mapping which matches targets

Implements an "all" matching strategy where a fused layer matches iff "all" of its components match

:param layer_name: layer name :param target_layers: list of targets to match the layer against :param fused_mapping: map from fused layer names to its components

Examples:

layer_name = "model.layers.0.self_attn.qkv_proj" target_layers = ["model.layers.0.self_attn.q_proj", "model.layers.0.self_attn.k_proj", "model.layers.0.self_attn.v_proj"]

Source code in vllm/model_executor/layers/quantization/compressed_tensors/utils.py

def _match_fused_layer(
        layer_name: str, target_layers: Iterable[str],
        fused_mapping: Mapping[str, list[str]]) -> Optional[str]:
    """
    Match a fused layer name to its corresponding individual layer in 
    target_layers. Returns first value in fused_mapping which matches targets

    Implements an "all" matching strategy where a fused layer matches iff
    "all" of its components match

    :param layer_name: layer name
    :param target_layers: list of targets to match the layer against
    :param fused_mapping: map from fused layer names to its components

    Examples:
        layer_name = "model.layers.0.self_attn.qkv_proj"
        target_layers = ["model.layers.0.self_attn.q_proj",
                        "model.layers.0.self_attn.k_proj",
                        "model.layers.0.self_attn.v_proj"]
    """
    # find layer_name in mapping
    fused = next((key for key in fused_mapping if layer_name.endswith(key)),
                 None)
    if fused is None:
        return None

    # expand path of unfused components
    unfused_paths = [
        layer_name.replace(fused, unfused) for unfused in fused_mapping[fused]
    ]

    # for each unfused component, find a match in targets
    unfused_matches: list[Optional[str]] = []
    for unfused in unfused_paths:
        for target in target_layers:
            if _is_equal_or_regex_match(unfused, target):
                unfused_matches.append(target)
                break
        else:
            unfused_matches.append(None)

    return unfused_matches[0] if all(unfused_matches) else None

check_equal_or_regex_match ¶

check_equal_or_regex_match(
    layer_name: str, targets: Iterable[str]
) -> bool

Checks whether a layer_name is exactly equal or a regex match for if target starts with 're:' to any target in list.

Source code in vllm/model_executor/layers/quantization/compressed_tensors/utils.py

def check_equal_or_regex_match(layer_name: str,
                               targets: Iterable[str]) -> bool:
    """
    Checks whether a layer_name is exactly equal or a regex match for
    if target starts with 're:' to any target in list.
    """
    for target in targets:
        if _is_equal_or_regex_match(layer_name, target):
            return True
    return False

find_matched_target ¶

find_matched_target(
    layer_name: Optional[str],
    module: Module,
    targets: Iterable[str],
    fused_mapping: Mapping[
        str, list[str]
    ] = MappingProxyType({}),
) -> str

Helper function to look up which "target" in the compressed-tensors config that a layer corresponds to.

Recall that a compressed-tensors configs has a concept of config_groups, where each layer can be quantized with with a different scheme.

targets in each config_group will be a list of either layer names (or regexes corresponding to layer names) or names of torch Modules.

First, we try to match the layer_name with a target Second, we try to match the module's name with a target Third, we try to map the layer_name to a list of fused module names. All component module names must match in order for a match to be successful. A successful match returns the first component target

:param layer_name: layer name :param module: torch.nn.Module :param targets: list of targets to match the layer against :param fused_mapping: map from fused layer names to its components :param fused_strategy: either "all" or "any". If using "all", fused layers match if "all" of its components match

Source code in vllm/model_executor/layers/quantization/compressed_tensors/utils.py

def find_matched_target(
    layer_name: Optional[str],
    module: Module,
    targets: Iterable[str],
    fused_mapping: Mapping[str, list[str]] = MappingProxyType({})
) -> str:
    """
    Helper function to look up which "target" in the compressed-tensors
    config that a layer corresponds to.

    Recall that a compressed-tensors configs has a concept of
    config_groups, where each layer can be quantized with with a different
    scheme.

    targets in each config_group will be a list of either layer names
    (or regexes corresponding to layer names) or names of torch Modules.

    First, we try to match the layer_name with a target
    Second, we try to match the module's name with a target
    Third, we try to map the layer_name to a list of fused module names.
        *All* component module names must match in order for a match to be
        successful. A successful match returns the first component target

    :param layer_name: layer name
    :param module: torch.nn.Module
    :param targets: list of targets to match the layer against
    :param fused_mapping: map from fused layer names to its components
    :param fused_strategy: either "all" or "any". If using "all", fused
        layers match if "all" of its components match
    """

    if layer_name is None:
        layer_name = ""

    matched_target = (
        _find_first_match(layer_name, targets)
        or _find_first_match(module.__class__.__name__, targets, True)
        or _match_fused_layer(layer_name, targets, fused_mapping))

    if matched_target is None:
        raise ValueError(
            f"Unable to find matching target for {layer_name} in the "
            "compressed-tensors config.")

    return matched_target

is_activation_quantization_format ¶

is_activation_quantization_format(format: str) -> bool

Source code in vllm/model_executor/layers/quantization/compressed_tensors/utils.py

def is_activation_quantization_format(format: str) -> bool:
    _ACTIVATION_QUANTIZATION_FORMATS = [
        CompressionFormat.naive_quantized.value,
        CompressionFormat.int_quantized.value,
        CompressionFormat.float_quantized.value,
        CompressionFormat.nvfp4_pack_quantized.value
    ]
    return format in _ACTIVATION_QUANTIZATION_FORMATS

should_ignore_layer ¶

should_ignore_layer(
    layer_name: Optional[str],
    ignore: Iterable[str] = tuple(),
    fused_mapping: Mapping[
        str, list[str]
    ] = MappingProxyType({}),
) -> bool

Source code in vllm/model_executor/layers/quantization/compressed_tensors/utils.py

def should_ignore_layer(
    layer_name: Optional[str],
    ignore: Iterable[str] = tuple(),
    fused_mapping: Mapping[str, list[str]] = MappingProxyType({})
) -> bool:
    if layer_name is None:
        return False

    # layer_name = model.layers.0.self_attn.qkv_proj
    # proj_name = qkv_proj
    proj_name = layer_name.split(".")[-1]

    # Fused layers like gate_up_proj or qkv_proj will not be fused
    # in the safetensors checkpoint. So, we convert the name
    # from the fused version to unfused + check to make sure that
    # each shard of the fused layer has the same scheme.
    if proj_name in fused_mapping and layer_name not in ignore:
        shard_proj_names = fused_mapping[proj_name]

        # Convert fused_name --> [shard_names]
        shard_names = [
            layer_name.replace(proj_name, shard_proj_name)
            for shard_proj_name in shard_proj_names
        ]

        # Layer should be ignored if shards are ignored.
        should_ignore_layer = None
        for shard_name in shard_names:
            should_ignore_shard = check_equal_or_regex_match(
                layer_name=shard_name, targets=ignore)

            # If shard_idx=0, set layer ignore to match shard.
            if should_ignore_layer is None:
                should_ignore_layer = should_ignore_shard

            # If shard_idx=1+ confirm scheme matches prior shards.
            elif should_ignore_shard != should_ignore_layer:
                raise ValueError(f"Found a different quantization schemes for "
                                 f"{shard_proj_names} in {layer_name}. vLLM "
                                 "requires all to use the same scheme.")

    # Unfused layers like down_proj and o_proj will match
    # the safetensors checkpoint already.
    else:
        should_ignore_layer = check_equal_or_regex_match(layer_name=layer_name,
                                                         targets=ignore)

    assert should_ignore_layer is not None
    return should_ignore_layer