vllm.model_executor.layers.quantization.utils.bitblas_utils
BITBLAS_OPTIMIZE_FEATURES (module attribute)
BITBLAS_OPTIMIZE_FEATURES_CONTIGUOUS (module attribute)
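A brief sketch of inspecting these two attributes. They appear to be the candidate dynamic-shape "opt features" that BitBLAS tunes matmul kernels for, with the `_CONTIGUOUS` variant used for contiguous batching; the exact values depend on the installed vLLM version:

```python
from vllm.model_executor.layers.quantization.utils import bitblas_utils

# Candidate dynamic-shape "opt features" that BitBLAS tunes kernels for;
# exact contents vary with the installed vLLM version.
print(bitblas_utils.BITBLAS_OPTIMIZE_FEATURES)
print(bitblas_utils.BITBLAS_OPTIMIZE_FEATURES_CONTIGUOUS)
```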
_check_bitblas_supported
```python
_check_bitblas_supported(
    quant_type: ScalarType,
    group_size: Optional[int],
    has_zp: bool,
    device_capability: Optional[int] = None,
) -> tuple[bool, Optional[str]]
```
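A minimal usage sketch. This is the internal checker behind `check_bitblas_supported` and `verify_bitblas_supported`; it returns `(ok, failure_reason)` rather than raising. `scalar_types.uint4b8` (the GPTQ-style 4-bit type from `vllm.scalar_type`) is used here for illustration:

```python
from vllm.scalar_type import scalar_types
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
    _check_bitblas_supported,
)

# Probe support for a GPTQ-style 4-bit type (uint4 with a bias of 8),
# group size 128, no zero points. Returns (supported, failure_reason).
ok, reason = _check_bitblas_supported(
    scalar_types.uint4b8, group_size=128, has_zp=False
)
if not ok:
    print(f"BitBLAS path unavailable: {reason}")
```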
bitblas_is_k_full
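No signature is rendered for this helper. Assuming it mirrors the Marlin analogue `marlin_is_k_full(act_order, is_row_parallel)`, a hypothetical sketch:

```python
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
    bitblas_is_k_full,
)

# Hypothetical call, assuming a Marlin-style
# (act_order: bool, is_row_parallel: bool) -> bool signature: with
# act_order on a row-parallel shard, each rank holds only a slice of K,
# so the kernel must treat the reduction dimension as partial.
is_k_full = bitblas_is_k_full(True, True)  # act_order, is_row_parallel
```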
bitblas_make_empty_g_idx
bitblas_make_empty_zp
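No signatures are rendered for these two helpers. Assuming they mirror the Marlin analogues (`marlin_make_empty_g_idx(device)` and friends), a hypothetical sketch covering both:

```python
import torch
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
    bitblas_make_empty_g_idx,
    bitblas_make_empty_zp,
)

# Hypothetical calls, assuming Marlin-style (device) signatures: build
# placeholder tensors for layers that use neither activation reordering
# (g_idx) nor zero points, keeping the parameter layout uniform.
device = torch.device("cuda")
empty_g_idx = bitblas_make_empty_g_idx(device)
empty_zp = bitblas_make_empty_zp(device)
```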
bitblas_repeat_scales_on_all_ranks
```python
bitblas_repeat_scales_on_all_ranks(
    act_order: bool, group_size: int, is_row_parallel: bool
) -> bool
```
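A short sketch. By analogy with the Marlin helper of the same name, channelwise quantization (`group_size == -1`) on a row-parallel layer is the case where scales must be replicated on every tensor-parallel rank rather than sharded:

```python
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
    bitblas_repeat_scales_on_all_ranks,
)

# Channelwise quantization (group_size == -1) on a row-parallel layer:
# the scales cannot be sharded along K, so they are repeated on every
# TP rank (semantics assumed from the Marlin analogue).
repeat = bitblas_repeat_scales_on_all_ranks(
    act_order=False, group_size=-1, is_row_parallel=True
)
```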
bitblas_sort_g_idx
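No signature is rendered here. Assuming it mirrors `marlin_sort_g_idx(g_idx)`, which returns the sorted indices plus the permutation used to sort them, a hypothetical sketch:

```python
import torch
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
    bitblas_sort_g_idx,
)

# Hypothetical call, assuming it mirrors marlin_sort_g_idx(g_idx):
# returns the sorted group indices plus the permutation that sorts
# them, which is later applied to the packed weight rows.
g_idx = torch.randint(0, 32, (128,), dtype=torch.int32)
sorted_g_idx, sort_indices = bitblas_sort_g_idx(g_idx)
```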
check_bitblas_supported
```python
check_bitblas_supported(
    quant_type: ScalarType,
    group_size: int,
    has_zp: bool = False,
    device_capability: Optional[int] = None,
) -> bool
```
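A minimal sketch of the boolean convenience wrapper (the failure reason from `_check_bitblas_supported` is discarded):

```python
from vllm.scalar_type import scalar_types
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
    check_bitblas_supported,
)

# Boolean wrapper around _check_bitblas_supported; the failure reason
# is discarded.
if check_bitblas_supported(scalar_types.uint4b8, group_size=128):
    print("BitBLAS kernel path available")
```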
check_bitblas_supports_shape
```python
check_bitblas_supports_shape(
    output_size_per_partition: int,
    input_size_per_partition: int,
    input_size: int,
    group_size: int,
) -> tuple[bool, Optional[str]]
```
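A sketch with illustrative sizes; the partition sizes describe one tensor-parallel shard of the full `input_size`:

```python
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
    check_bitblas_supports_shape,
)

# Illustrative sizes: one of two tensor-parallel shards of an
# 11008-wide input dimension (5504 per rank), 4096 outputs, groups of 128.
ok, reason = check_bitblas_supports_shape(
    output_size_per_partition=4096,
    input_size_per_partition=5504,
    input_size=11008,
    group_size=128,
)
if not ok:
    print(f"shape rejected: {reason}")
```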
query_bitblas_supported_quant_types
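No signature is rendered here. Assuming it follows the Marlin counterpart `query_marlin_supported_quant_types(has_zp, device_capability=None)` and returns a list of `ScalarType`s, a hypothetical sketch:

```python
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
    query_bitblas_supported_quant_types,
)

# Hypothetical calls, assuming a Marlin-style
# (has_zp: bool, device_capability: Optional[int] = None) signature
# returning the list of supported ScalarTypes.
print(query_bitblas_supported_quant_types(True))   # zero-point (AWQ-style)
print(query_bitblas_supported_quant_types(False))  # symmetric (GPTQ-style)
```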
unpack_gptq_qweight
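No full signature is rendered here. Assuming a `(qweight, bits)` signature like the `unpack_gptq_qzeros` helper below, a hypothetical sketch of unpacking GPTQ's int32-packed weights:

```python
import torch
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
    unpack_gptq_qweight,
)

# Hypothetical call, assuming a (qweight, bits) signature: unpack
# GPTQ's int32-packed weights (eight 4-bit values per int32) back to
# one integer per weight element.
qweight = torch.zeros((1024, 512), dtype=torch.int32)
unpacked = unpack_gptq_qweight(qweight, 4)
```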
unpack_gptq_qzeros
```python
unpack_gptq_qzeros(
    qzeros, bits, is_gptq_v2=False
) -> Tensor
```
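A short sketch; the `is_gptq_v2` flag presumably accounts for the zero-point offset difference between GPTQ checkpoint formats:

```python
import torch
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
    unpack_gptq_qzeros,
)

# Each int32 packs 32 // bits zero points (eight at 4 bits). The
# is_gptq_v2 flag presumably toggles the +1 offset correction that
# distinguishes GPTQ v1 from v2 checkpoints.
qzeros = torch.zeros((32, 64), dtype=torch.int32)  # 32 groups, 512 columns
zeros = unpack_gptq_qzeros(qzeros, bits=4)
```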
verify_bitblas_supported
```python
verify_bitblas_supported(
    quant_type: ScalarType,
    group_size: int,
    has_zp: bool = False,
) -> None
```
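A minimal sketch. Unlike `check_bitblas_supported`, this raises on an unsupported scheme (the exception type is assumed to be `ValueError` here):

```python
from vllm.scalar_type import scalar_types
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
    verify_bitblas_supported,
)

# Raising variant of check_bitblas_supported; the exception type is
# assumed to be ValueError.
try:
    verify_bitblas_supported(scalar_types.uint4b8, group_size=128)
except ValueError as e:
    print(f"cannot use BitBLAS: {e}")
```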
verify_bitblas_supports_shape
```python
verify_bitblas_supports_shape(
    output_size_per_partition: int,
    input_size_per_partition: int,
    input_size: int,
    group_size: int,
) -> None
```