class DeepseekV32IndexerMetadataBuilder(AttentionMetadataBuilder):
reorder_batch_threshold: int = 1
natively_supported_next_n_fp4: list[int] = [1, 2]
    # TODO(matt): integrate kernel with next_n = 4 support

    @classmethod
def get_cudagraph_support(
cls,
vllm_config: VllmConfig,
kv_cache_spec: AttentionSpec,
) -> AttentionCGSupport:
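        # CUDA graphs are supported only for batches with uniform query
        # lengths (e.g. pure decode / uniform spec-decode batches).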
return AttentionCGSupport.UNIFORM_BATCH

    def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
scheduler_config = self.vllm_config.scheduler_config
        # NOTE(Chen): an estimated max size of flattened_kv. Needs double-checking.
self.max_prefill_buffer_size = get_max_prefill_buffer_size(self.vllm_config)
self.num_speculative_tokens = (
self.vllm_config.speculative_config.num_speculative_tokens
if self.vllm_config.speculative_config
else 0
)
self.use_fp4_indexer_cache = (
self.vllm_config.attention_config.use_fp4_indexer_cache
)
assert (
current_platform.is_device_capability_family(100)
or not self.use_fp4_indexer_cache
), (
"use_fp4_indexer_cache requires Blackwell datacenter GPUs "
"(sm_10x, e.g. B200/GB200); sm_120 (consumer Blackwell) and "
"earlier architectures are not supported."
)
next_n = self.num_speculative_tokens + 1
self.reorder_batch_threshold += self.num_speculative_tokens
# NOTE(zyongye) fp4 indexer cache only natively supports next_n in
# natively_supported_next_n_fp4; for other next_n values we fall back
# to the flattening path. Outside the SM100 datacenter family the FP8
# paged MQA logits kernel has the same [1, 2] constraint (deepgemm
# smxx_fp8_fp4_paged_mqa_logits.hpp:233), so flatten there too.
self.use_flattening = (
self.use_fp4_indexer_cache
or not current_platform.is_device_capability_family(100)
) and next_n not in self.natively_supported_next_n_fp4
        self.num_sms = num_compute_units(self.device.index)
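        # Static offsets [0, ..., next_n - 1], added per token position when
        # building 2D context lengths on the native MTP path.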
self.offsets_buffer = torch.arange(
next_n, device=self.device, dtype=torch.int32
)
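        # Per-request decode lengths; sized to max_num_batched_tokens so the
        # flattened layout (one entry per expanded token) also fits.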
self.decode_lens_buffer = torch.zeros(
(scheduler_config.max_num_batched_tokens,),
dtype=torch.int32,
device=self.device,
)
if not self.use_flattening and next_n > 1:
# Native MTP: 2D buffer for per-token seq_lens.
self.decode_seq_lens_buffer = torch.zeros(
(scheduler_config.max_num_seqs, next_n),
dtype=torch.int32,
device=self.device,
)
else:
# Flattening or no MTP: 1D buffer for expanded per-token seq_lens.
self.decode_seq_lens_buffer = torch.zeros(
(scheduler_config.max_num_batched_tokens,),
dtype=torch.int32,
device=self.device,
)
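        # Static arange supplying per-token position indices for the
        # flattening path in _prepare_decode_tensors.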
self.arange_buffer = torch.arange(
scheduler_config.max_num_seqs * next_n,
dtype=torch.int32,
device=self.device,
)
max_num_blocks_per_req = cdiv(
self.vllm_config.model_config.max_model_len,
self.kv_cache_spec.block_size * get_total_cp_world_size(),
)
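        # One block-table row per expanded token; the flattening path
        # replicates each request's row decode_len times.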
self.expanded_block_table_buffer = torch.zeros(
(
scheduler_config.max_num_batched_tokens,
max_num_blocks_per_req,
),
dtype=torch.int32,
device=self.device,
)
        # See: DeepGEMM/csrc/apis/attention.hpp
self.scheduler_metadata_buffer = torch.empty(
(self.num_sms + 1, 2), dtype=torch.int32, device=self.device
)
        # KV compression ratio. Defaults to 1 (no compression); DeepseekV4
        # (MLAAttentionSpec) uses compress_ratio > 1.
        self.compress_ratio = 1
        if isinstance(self.kv_cache_spec, MLAAttentionSpec):
            self.compress_ratio = self.kv_cache_spec.compress_ratio
        # Pre-allocate buffers for CUDA graph compatibility when
        # compress_ratio > 1 (DeepseekV4).
        if self.compress_ratio > 1:
# Compressed slot mapping output buffer
self.compressed_slot_mapping_buffer = torch.zeros(
(scheduler_config.max_num_batched_tokens,),
dtype=torch.int64,
device=self.device,
)
# Buffer for compressed seq_lens in decode path
self.expanded_seq_lens_buffer = torch.zeros(
(scheduler_config.max_num_batched_tokens,),
dtype=torch.int32,
device=self.device,
)

    def _prepare_decode_tensors(
self,
seq_lens: torch.Tensor,
block_table: torch.Tensor,
decode_lens: torch.Tensor,
decode_lens_cpu: torch.Tensor,
query_start_loc: torch.Tensor,
num_decodes: int,
num_decode_tokens: int,
use_native: bool,
next_n: int,
max_decode_len: int,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, bool]:
"""Expand seq_lens/block_table/decode_lens for the decode kernels.
Flatten path (not use_native, max_decode_len > 1):
Each multi-token decode request is expanded into individual
single-token entries so the kernel always sees next_n=1.
Native path (use_native or max_decode_len == 1):
Plain decode or spec-decode with 2D per-token context lengths.
Returns (seq_lens, block_table, decode_lens, batch_size, requires_padding).
seq_lens is 1D (batch_size,) for flatten/plain, 2D (B, next_n) for native MTP.
"""
min_decode_len = int(decode_lens_cpu.min().item())
if not use_native and max_decode_len > 1:
assert self.decode_seq_lens_buffer.dim() == 1
if min_decode_len == max_decode_len:
# Uniform decode lengths.
num_decode_tokens = num_decodes * max_decode_len
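                # One Triton program per expanded token: fills the per-token
                # seq_lens, the replicated block-table rows, and decode_lens
                # for each expanded entry.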
_prepare_uniform_decode_kernel[(num_decode_tokens,)](
seq_lens,
self.decode_seq_lens_buffer,
block_table,
block_table.stride(0),
self.expanded_block_table_buffer,
self.expanded_block_table_buffer.stride(0),
self.decode_lens_buffer,
max_decode_len,
BLOCK_SIZE=1024,
)
self.decode_seq_lens_buffer[num_decode_tokens:] = 0
seq_lens = self.decode_seq_lens_buffer[:num_decode_tokens]
block_table = self.expanded_block_table_buffer[:num_decode_tokens]
decode_lens = self.decode_lens_buffer[:num_decode_tokens]
return seq_lens, block_table, decode_lens, num_decode_tokens, False
else:
                # Variable decode lengths.
                # The example comments below assume 4 requests with
                # seq_lens [10, 7, 12, 0] (the final req is padding) and
                # decode_lens [3, 1, 4, 0], so the context lengths are
                # [10-3, 7-1, 12-4, 0-0] = [7, 6, 8, 0].
                # actual_expanded = 3 + 1 + 4 + 0 = 8
                actual_expanded = int(decode_lens_cpu.sum().item())
# Fuse expanded_base and expanded_starts into a single
# repeat_interleave:
# seq_len_i = (context_start[b] - query_start_loc[b]) + arange[i] + 1
# where context_start[b] = seq_lens[b] - decode_lens[b].
# Example: offsets = [7-0, 6-3, 8-4, 0-8] = [7, 3, 4, -8]
# expanded_offsets = [7, 7, 7, 3, 4, 4, 4, 4]
# result = [8, 9, 10, 7, 9, 10, 11, 12]
expanded_offsets = torch.repeat_interleave(
seq_lens - decode_lens - query_start_loc,
decode_lens,
output_size=actual_expanded,
)
# [8, 9, 10, 7, 9, 10, 11, 12, ...] where ... is unused buffer space
self.decode_seq_lens_buffer[:actual_expanded] = (
expanded_offsets + self.arange_buffer[:actual_expanded] + 1
)
self.decode_seq_lens_buffer[actual_expanded:] = 0
seq_lens = self.decode_seq_lens_buffer[:num_decode_tokens]
# Give each of the flattened entries the same block table row as the
# original request.
self.expanded_block_table_buffer[:actual_expanded] = (
torch.repeat_interleave(
block_table, decode_lens, dim=0, output_size=actual_expanded
)
)
if actual_expanded < num_decode_tokens:
self.expanded_block_table_buffer[
actual_expanded:num_decode_tokens, 0
] = 0
block_table = self.expanded_block_table_buffer[:num_decode_tokens]
# All reqs now have decode_len=1
self.decode_lens_buffer[:num_decode_tokens] = 1
decode_lens = self.decode_lens_buffer[:num_decode_tokens]
return seq_lens, block_table, decode_lens, num_decode_tokens, False
else:
# Native path: plain decode (next_n==1) or spec decode
# with 2D per-token context lengths (next_n > 1).
#
# When decode_lens are not truly uniform (e.g. some requests have
# decode_len < next_n due to padding or short prefills), the simple
# reshape in sparse_attn_indexer won't work. Use pack_seq_triton
# (requires_padding) instead.
requires_padding = min_decode_len != max_decode_len
if use_native and next_n > 1:
assert self.decode_seq_lens_buffer.dim() == 2
# (B, max_decode_len): token j attends to
# L - max_decode_len + j + 1 KV tokens.
self.decode_seq_lens_buffer[:num_decodes, :max_decode_len] = (
seq_lens.unsqueeze(1)
- max_decode_len
+ 1
+ self.offsets_buffer[:max_decode_len]
)
seq_lens = self.decode_seq_lens_buffer[:num_decodes, :max_decode_len]
return seq_lens, block_table, decode_lens, num_decodes, requires_padding

    def build(
self,
common_prefix_len: int,
common_attn_metadata: CommonAttentionMetadata,
fast_build: bool = False,
) -> DeepseekV32IndexerMetadata:
num_reqs = common_attn_metadata.num_reqs
num_tokens = common_attn_metadata.num_actual_tokens
query_start_loc = common_attn_metadata.query_start_loc
query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
seq_lens = common_attn_metadata.seq_lens
slot_mapping = common_attn_metadata.slot_mapping
block_table = common_attn_metadata.block_table_tensor
num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
split_decodes_and_prefills(
common_attn_metadata,
decode_threshold=self.reorder_batch_threshold,
require_uniform=not self.use_flattening,
)
)
assert num_decodes + num_prefills == num_reqs
assert num_decode_tokens + num_prefill_tokens == num_tokens
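        # With compression (compress_ratio > 1, DeepseekV4), the indexer
        # cache is addressed in compressed-token units: remap the slot
        # mapping and scale seq_lens to match.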
compressed_slot_mapping = slot_mapping
compressed_seq_lens = seq_lens
if self.compress_ratio > 1:
compressed_slot_mapping = get_compressed_slot_mapping(
num_tokens,
query_start_loc,
seq_lens,
block_table,
self.kv_cache_spec.storage_block_size,
self.compress_ratio,
out=self.compressed_slot_mapping_buffer,
)
compressed_seq_lens = seq_lens // self.compress_ratio
prefill_metadata = None
if num_prefills > 0:
            # This CPU value is an upper bound for async-spec extend rows and
            # exact for prefill rows (the `[num_decodes:]` slice below). It
            # is safe for chunking/allocation because the CUDA metadata below
            # is built from exact device seq_lens and the gather ignores the
            # tail.
            assert common_attn_metadata.seq_lens_cpu_upper_bound is not None
            seq_lens_cpu = common_attn_metadata.seq_lens_cpu_upper_bound
compressed_seq_lens_cpu = (
seq_lens_cpu // self.compress_ratio
if self.compress_ratio > 1
else seq_lens_cpu
)
prefill_query_lens_cpu = torch.diff(
query_start_loc_cpu[num_decodes : num_decodes + num_prefills + 1]
)
max_logits_bytes = envs.VLLM_SPARSE_INDEXER_MAX_LOGITS_MB * 1024 * 1024
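            # Split prefill requests into chunks whose flattened KV fits in
            # max_prefill_buffer_size and whose logits stay under
            # max_logits_bytes.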
chunk_specs = split_indexer_prefill_chunks(
compressed_seq_lens_cpu[num_decodes:],
prefill_query_lens_cpu,
self.max_prefill_buffer_size,
max_logits_bytes,
request_offset=num_decodes,
)
chunks = []
for req_slice, query_slice in chunk_specs:
metadata = build_prefill_chunk_metadata(
req_slice.start,
req_slice.stop,
query_start_loc,
query_start_loc_cpu,
seq_lens,
compressed_seq_lens,
compressed_seq_lens_cpu,
common_attn_metadata.block_table_tensor,
self.compress_ratio,
query_slice=query_slice,
skip_kv_gather=query_slice.start > 0,
)
                # Skip when total_seq_lens is 0 (i.e., no compressed tokens).
if metadata is not None:
chunks.append(metadata)
prefill_metadata = DeepseekV32IndexerPrefillMetadata(chunks)
decode_metadata = None
if num_decodes > 0:
torch.diff(
common_attn_metadata.query_start_loc[: num_decodes + 1],
out=self.decode_lens_buffer[:num_decodes],
)
decode_lens = self.decode_lens_buffer[:num_decodes]
decode_lens_cpu = torch.diff(
common_attn_metadata.query_start_loc_cpu[: num_decodes + 1]
)
seq_lens = common_attn_metadata.seq_lens[:num_decodes]
block_table = common_attn_metadata.block_table_tensor[:num_decodes, ...]
max_decode_len = int(decode_lens_cpu.max().item())
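            # Use the native kernels only when flattening is disabled and no
            # request decodes more than next_n tokens; otherwise multi-token
            # decodes are expanded into single-token entries.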
next_n = 1 + self.num_speculative_tokens
use_native = not self.use_flattening and max_decode_len <= next_n
seq_lens, block_table, decode_lens, batch_size, requires_padding = (
self._prepare_decode_tensors(
seq_lens=seq_lens,
block_table=block_table,
decode_lens=decode_lens,
decode_lens_cpu=decode_lens_cpu,
query_start_loc=common_attn_metadata.query_start_loc[:num_decodes],
num_decodes=num_decodes,
num_decode_tokens=num_decode_tokens,
use_native=use_native,
next_n=next_n,
max_decode_len=max_decode_len,
)
)
# For DeepseekV4 (compress_ratio > 1), the indexer KV cache stores
# compressed tokens. Convert uncompressed seq_lens to compressed.
if self.compress_ratio > 1:
# True iff seq_lens aliases decode_seq_lens_buffer (flatten or
# native wrote it); False iff it aliases common_attn_metadata.
seq_lens_is_local_view = (use_native and next_n > 1) or (
not use_native and max_decode_len > 1
)
if seq_lens_is_local_view:
seq_lens //= self.compress_ratio
else:
# Copy to avoid mutating shared state; keeps CG address stable.
self.expanded_seq_lens_buffer[:num_decodes] = (
seq_lens // self.compress_ratio
)
self.expanded_seq_lens_buffer[num_decodes:num_decode_tokens] = 0
seq_lens = self.expanded_seq_lens_buffer[:num_decode_tokens]
# Non-MTP: deep_gemm paged MQA logits requires 2D context_lens
# (csrc/apis/attention.hpp). Unsqueeze to (B, 1) so downstream
# kernels see the same (B, next_n) layout as the MTP path.
if seq_lens.dim() == 1:
seq_lens = seq_lens.unsqueeze(-1)
# DeepGEMM is required for the paged MQA logits on CUDA devices
if current_platform.is_cuda() and has_deep_gemm():
self.scheduler_metadata_buffer[:] = get_paged_mqa_logits_metadata(
seq_lens,
self.kv_cache_spec.storage_block_size,
self.num_sms,
)
decode_metadata = DeepSeekV32IndexerDecodeMetadata(
block_table=block_table,
seq_lens=seq_lens,
decode_lens=decode_lens,
requires_padding=requires_padding,
schedule_metadata=self.scheduler_metadata_buffer,
)
attn_metadata = DeepseekV32IndexerMetadata(
seq_lens=common_attn_metadata.seq_lens,
max_seq_len=common_attn_metadata.max_seq_len,
slot_mapping=compressed_slot_mapping,
num_decodes=num_decodes,
num_decode_tokens=num_decode_tokens,
num_prefills=num_prefills,
num_prefill_tokens=num_prefill_tokens,
prefill=prefill_metadata,
decode=decode_metadata,
)
return attn_metadata