vllm.model_executor.layers.quantization.utils.marlin_utils
_check_marlin_supported
¶
_check_marlin_supported(
quant_type: ScalarType,
group_size: Optional[int],
has_zp: bool,
device_capability: Optional[int] = None,
) -> tuple[bool, Optional[str]]
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
apply_awq_marlin_linear
¶
apply_awq_marlin_linear(
input: Tensor,
weight: Tensor,
weight_scale: Tensor,
weight_zp: Tensor,
g_idx: Tensor,
g_idx_sort_indices: Tensor,
workspace: Tensor,
quant_type: ScalarType,
output_size_per_partition: int,
input_size_per_partition: int,
bias: Optional[Tensor] = None,
use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
) -> Tensor
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
apply_gptq_marlin_linear
¶
apply_gptq_marlin_linear(
input: Tensor,
weight: Tensor,
weight_scale: Tensor,
weight_zp: Tensor,
g_idx: Tensor,
g_idx_sort_indices: Tensor,
workspace: Tensor,
wtype: ScalarType,
output_size_per_partition: int,
input_size_per_partition: int,
is_k_full: bool,
bias: Optional[Tensor] = None,
use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
) -> Tensor
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
awq_to_marlin_zero_points
¶
awq_to_marlin_zero_points(
q_zp_packed: Tensor,
size_k: int,
size_n: int,
num_bits: int,
) -> Tensor
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
check_marlin_supported
¶
check_marlin_supported(
quant_type: ScalarType,
group_size: int,
has_zp: bool = False,
device_capability: Optional[int] = None,
) -> bool
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
check_marlin_supports_layer
¶
check_marlin_supports_layer(
layer: LinearBase, group_size: int
) -> bool
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
check_marlin_supports_shape
¶
check_marlin_supports_shape(
output_size_per_partition: int,
input_size_per_partition: int,
input_size: int,
group_size: int,
) -> tuple[bool, Optional[str]]
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
check_moe_marlin_supports_layer
¶
check_moe_marlin_supports_layer(
layer: LinearBase, group_size: int
) -> bool
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
get_scale_perms
¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
marlin_is_k_full
¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
marlin_make_empty_g_idx
¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
marlin_make_empty_zp
¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
marlin_make_workspace
¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
marlin_make_workspace_new
¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
marlin_moe_permute_scales
¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
marlin_permute_scales
¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
marlin_repeat_scales_on_all_ranks
¶
marlin_repeat_scales_on_all_ranks(
act_order: bool, group_size: int, is_row_parallel: bool
) -> bool
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
marlin_sort_g_idx
¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
marlin_zero_points
¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
maybe_warn_marlin_atomic_add
¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
maybe_warn_marlin_atomic_add_env
¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
moe_awq_to_marlin_zero_points
¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
query_marlin_supported_quant_types
¶
query_marlin_supported_quant_types(
has_zp: Optional[bool] = None,
include_fp_type: bool = True,
device_capability: Optional[int] = None,
)
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
should_use_atomic_add_reduce
¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
verify_marlin_supported
¶
verify_marlin_supported(
quant_type: ScalarType,
group_size: int,
has_zp: bool = False,
) -> None
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
verify_marlin_supports_shape
¶
verify_marlin_supports_shape(
output_size_per_partition: int,
input_size_per_partition: int,
input_size: int,
group_size: int,
) -> None