vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector

EngineId module-attribute

EngineId = str

GET_META_MSG module-attribute

GET_META_MSG = b'get_meta_msg'

ReqId module-attribute

ReqId = str

Transfer module-attribute

Transfer = tuple[int, float]
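
Each Transfer pairs a NIXL transfer handle (int) with the time the transfer was started (float); the worker's _pop_done_transfers unpacks each entry as (handle, _xfer_stime) when polling for completion.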

logger module-attribute

logger = init_logger(__name__)

NixlAgentMetadata

Bases: Struct

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
class NixlAgentMetadata(
        msgspec.Struct,
        omit_defaults=True,  # type: ignore[call-arg]
        # required for @cached_property.
        dict=True):
    engine_id: str
    agent_metadata: bytes
    kv_caches_base_addr: list[int]
    num_blocks: int
    block_len: int
    attn_backend_name: str

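This struct is what one engine sends to another over the ZMQ side channel during the NIXL handshake. A minimal sketch of the round trip, using the same msgspec msgpack encoder/decoder that the handshake listener and initiator use; all field values below are illustrative placeholders, not real addresses or sizes:

import msgspec

from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
    NixlAgentMetadata)

# Placeholder values; real ones come from the NIXL wrapper and the
# registered KV cache tensors.
meta = NixlAgentMetadata(
    engine_id="engine-0",
    agent_metadata=b"<opaque nixl agent blob>",
    kv_caches_base_addr=[0x7F0000000000],
    num_blocks=1024,
    block_len=256 * 1024,
    attn_backend_name="FLASH_ATTN_VLLM_V1",  # illustrative backend name
)

# The listener encodes the struct once and replies with the bytes; the
# initiating side decodes them back into a NixlAgentMetadata.
encoded = msgspec.msgpack.Encoder().encode(meta)
decoded = msgspec.msgpack.Decoder(NixlAgentMetadata).decode(encoded)
assert decoded.engine_id == "engine-0"
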
agent_metadata instance-attribute

agent_metadata: bytes

attn_backend_name instance-attribute

attn_backend_name: str

block_len instance-attribute

block_len: int

engine_id instance-attribute

engine_id: str

kv_caches_base_addr instance-attribute

kv_caches_base_addr: list[int]

num_blocks instance-attribute

num_blocks: int

NixlConnector

Bases: KVConnectorBase_V1

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
class NixlConnector(KVConnectorBase_V1):

    def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole):
        assert vllm_config.kv_transfer_config is not None
        assert vllm_config.kv_transfer_config.engine_id is not None
        self.engine_id: EngineId = vllm_config.kv_transfer_config.engine_id

        if role == KVConnectorRole.SCHEDULER:
            self.connector_scheduler: Optional[NixlConnectorScheduler] = \
                NixlConnectorScheduler(vllm_config, self.engine_id)
            self.connector_worker: Optional[NixlConnectorWorker] = None
        elif role == KVConnectorRole.WORKER:
            self.connector_scheduler = None
            self.connector_worker = NixlConnectorWorker(
                vllm_config, self.engine_id)

    ############################################################
    # Scheduler Side Methods
    ############################################################

    def get_num_new_matched_tokens(
            self, request: "Request",
            num_computed_tokens: int) -> tuple[int, bool]:
        assert self.connector_scheduler is not None
        return self.connector_scheduler.get_num_new_matched_tokens(
            request, num_computed_tokens)

    def update_state_after_alloc(self, request: "Request",
                                 blocks: "KVCacheBlocks",
                                 num_external_tokens: int):
        assert self.connector_scheduler is not None
        return self.connector_scheduler.update_state_after_alloc(
            request, blocks, num_external_tokens)

    def build_connector_meta(
        self,
        scheduler_output: SchedulerOutput,
    ) -> KVConnectorMetadata:
        assert self.connector_scheduler is not None
        return self.connector_scheduler.build_connector_meta(scheduler_output)

    def request_finished(
        self,
        request: "Request",
        block_ids: list[int],
    ) -> tuple[bool, Optional[dict[str, Any]]]:
        assert self.connector_scheduler is not None
        return self.connector_scheduler.request_finished(request, block_ids)

    ############################################################
    # Worker Side Methods
    ############################################################
    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
        assert self.connector_worker is not None
        self.connector_worker.register_kv_caches(kv_caches)

    def get_finished(self,
                     finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
        """Get the finished recving and sending requests."""
        assert self.connector_worker is not None
        return self.connector_worker.get_finished()

    def start_load_kv(self, forward_context: "ForwardContext",
                      **kwargs) -> None:
        assert self.connector_worker is not None
        assert isinstance(self._connector_metadata, NixlConnectorMetadata)
        self.connector_worker.start_load_kv(self._connector_metadata)

    def wait_for_layer_load(self, layer_name: str) -> None:
        """NixlConnector does not do layerwise saving."""
        pass

    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
                      attn_metadata: "AttentionMetadata", **kwargs) -> None:
        """NixlConnector does not save explicitly."""
        pass

    def wait_for_save(self):
        """NixlConnector does not save explicitly."""
        pass

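The connector is a thin facade: the role passed at construction decides whether the scheduler-side or the worker-side implementation is instantiated, and each method asserts that the matching half exists before delegating. A minimal sketch of that wiring (imports omitted; vllm_config is assumed to be a fully populated VllmConfig with kv_transfer_config.engine_id set):

# Scheduler process: only the scheduler half is constructed.
scheduler_side = NixlConnector(vllm_config, KVConnectorRole.SCHEDULER)
assert scheduler_side.connector_scheduler is not None
assert scheduler_side.connector_worker is None

# Worker process: only the worker half is constructed.
worker_side = NixlConnector(vllm_config, KVConnectorRole.WORKER)
assert worker_side.connector_worker is not None
assert worker_side.connector_scheduler is None
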
connector_scheduler instance-attribute

connector_scheduler: Optional[NixlConnectorScheduler] = (
    NixlConnectorScheduler(vllm_config, engine_id)
)

connector_worker instance-attribute

connector_worker: Optional[NixlConnectorWorker] = None

engine_id instance-attribute

engine_id: EngineId = engine_id

__init__

__init__(vllm_config: VllmConfig, role: KVConnectorRole)
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole):
    assert vllm_config.kv_transfer_config is not None
    assert vllm_config.kv_transfer_config.engine_id is not None
    self.engine_id: EngineId = vllm_config.kv_transfer_config.engine_id

    if role == KVConnectorRole.SCHEDULER:
        self.connector_scheduler: Optional[NixlConnectorScheduler] = \
            NixlConnectorScheduler(vllm_config, self.engine_id)
        self.connector_worker: Optional[NixlConnectorWorker] = None
    elif role == KVConnectorRole.WORKER:
        self.connector_scheduler = None
        self.connector_worker = NixlConnectorWorker(
            vllm_config, self.engine_id)

build_connector_meta

build_connector_meta(
    scheduler_output: SchedulerOutput,
) -> KVConnectorMetadata
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def build_connector_meta(
    self,
    scheduler_output: SchedulerOutput,
) -> KVConnectorMetadata:
    assert self.connector_scheduler is not None
    return self.connector_scheduler.build_connector_meta(scheduler_output)

get_finished

get_finished(
    finished_req_ids: set[str],
) -> tuple[set[str], set[str]]

Get the finished recving and sending requests.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def get_finished(self,
                 finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
    """Get the finished recving and sending requests."""
    assert self.connector_worker is not None
    return self.connector_worker.get_finished()

get_num_new_matched_tokens

get_num_new_matched_tokens(
    request: Request, num_computed_tokens: int
) -> tuple[int, bool]
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def get_num_new_matched_tokens(
        self, request: "Request",
        num_computed_tokens: int) -> tuple[int, bool]:
    assert self.connector_scheduler is not None
    return self.connector_scheduler.get_num_new_matched_tokens(
        request, num_computed_tokens)

register_kv_caches

register_kv_caches(kv_caches: dict[str, Tensor])
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
    assert self.connector_worker is not None
    self.connector_worker.register_kv_caches(kv_caches)

request_finished

request_finished(
    request: Request, block_ids: list[int]
) -> tuple[bool, Optional[dict[str, Any]]]
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def request_finished(
    self,
    request: "Request",
    block_ids: list[int],
) -> tuple[bool, Optional[dict[str, Any]]]:
    assert self.connector_scheduler is not None
    return self.connector_scheduler.request_finished(request, block_ids)

save_kv_layer

save_kv_layer(
    layer_name: str,
    kv_layer: Tensor,
    attn_metadata: AttentionMetadata,
    **kwargs,
) -> None

NixlConnector does not save explicitly.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
                  attn_metadata: "AttentionMetadata", **kwargs) -> None:
    """NixlConnector does not save explicitly."""
    pass

start_load_kv

start_load_kv(
    forward_context: ForwardContext, **kwargs
) -> None
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def start_load_kv(self, forward_context: "ForwardContext",
                  **kwargs) -> None:
    assert self.connector_worker is not None
    assert isinstance(self._connector_metadata, NixlConnectorMetadata)
    self.connector_worker.start_load_kv(self._connector_metadata)

update_state_after_alloc

update_state_after_alloc(
    request: Request,
    blocks: KVCacheBlocks,
    num_external_tokens: int,
)
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def update_state_after_alloc(self, request: "Request",
                             blocks: "KVCacheBlocks",
                             num_external_tokens: int):
    assert self.connector_scheduler is not None
    return self.connector_scheduler.update_state_after_alloc(
        request, blocks, num_external_tokens)

wait_for_layer_load

wait_for_layer_load(layer_name: str) -> None

NixlConnector does not do layerwise saving.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def wait_for_layer_load(self, layer_name: str) -> None:
    """NixlConnector does not do layerwise saving."""
    pass

wait_for_save

wait_for_save()

NixlConnector does not save explicitly.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def wait_for_save(self):
    """NixlConnector does not save explicitly."""
    pass

NixlConnectorMetadata

Bases: KVConnectorMetadata

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
class NixlConnectorMetadata(KVConnectorMetadata):

    def __init__(self):
        self.requests: dict[ReqId, ReqMeta] = {}

    def add_new_req(
        self,
        request_id: ReqId,
        local_block_ids: list[int],
        kv_transfer_params: dict[str, Any],
    ):
        self.requests[request_id] = ReqMeta(
            local_block_ids=local_block_ids,
            remote_block_ids=kv_transfer_params["remote_block_ids"],
            remote_engine_id=kv_transfer_params["remote_engine_id"],
            remote_host=kv_transfer_params["remote_host"],
            remote_port=kv_transfer_params["remote_port"],
            # P workers don't need to receive tp_size from proxy here.
            tp_size=kv_transfer_params.get("tp_size", 1),
        )

requests instance-attribute

requests: dict[ReqId, ReqMeta] = {}

__init__

__init__()
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def __init__(self):
    self.requests: dict[ReqId, ReqMeta] = {}

add_new_req

add_new_req(
    request_id: ReqId,
    local_block_ids: list[int],
    kv_transfer_params: dict[str, Any],
)
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def add_new_req(
    self,
    request_id: ReqId,
    local_block_ids: list[int],
    kv_transfer_params: dict[str, Any],
):
    self.requests[request_id] = ReqMeta(
        local_block_ids=local_block_ids,
        remote_block_ids=kv_transfer_params["remote_block_ids"],
        remote_engine_id=kv_transfer_params["remote_engine_id"],
        remote_host=kv_transfer_params["remote_host"],
        remote_port=kv_transfer_params["remote_port"],
        # P workers don't need to receive tp_size from proxy here.
        tp_size=kv_transfer_params.get("tp_size", 1),
    )

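A sketch of how the scheduler-side metadata is assembled for a request that should pull its KV blocks from a remote prefill instance; the ids, host and port below are illustrative placeholders:

meta = NixlConnectorMetadata()
meta.add_new_req(
    request_id="req-0",
    local_block_ids=[0, 1, 2],
    kv_transfer_params={
        "remote_block_ids": [7, 8, 9],
        "remote_engine_id": "prefill-engine",
        "remote_host": "10.0.0.5",
        "remote_port": 5600,
        # "tp_size" may be omitted; add_new_req defaults it to 1.
    },
)
assert "req-0" in meta.requests
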
NixlConnectorScheduler

Implementation of Scheduler side methods

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
class NixlConnectorScheduler:
    """Implementation of Scheduler side methods"""

    def __init__(self, vllm_config: VllmConfig, engine_id: str):
        self.vllm_config = vllm_config
        self.block_size = vllm_config.cache_config.block_size
        self.engine_id: EngineId = engine_id
        self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
        self.side_channel_port = (
            envs.VLLM_NIXL_SIDE_CHANNEL_PORT +
            vllm_config.parallel_config.data_parallel_rank *
            vllm_config.parallel_config.tensor_parallel_size)
        logger.info("Initializing NIXL Scheduler %s", engine_id)

        # Requests that need to start recv.
        # New requests are added by update_state_after_alloc in
        # the scheduler. Used to make metadata passed to Worker.
        self._reqs_need_recv: dict[ReqId, tuple[Request, list[int]]] = {}

    def get_num_new_matched_tokens(
            self, request: "Request",
            num_computed_tokens: int) -> tuple[int, bool]:
        """
        For remote prefill, pull all prompt blocks from remote
        asynchronously relative to engine execution.

        Args:
            request (Request): the request object.
            num_computed_tokens (int): the number of locally
                computed tokens for this request
        Returns:
            * the number of tokens that can be loaded from the 
              external KV cache beyond what is already computed.
            * true if the external KV cache tokens will be loaded
              asynchronously (between scheduler steps).
        """

        params = request.kv_transfer_params
        logger.debug(
            "NIXLConnector get_num_new_matched_tokens: "
            "num_computed_tokens=%s, kv_transfer_params=%s",
            num_computed_tokens, params)

        if params is not None and params.get("do_remote_prefill"):
            # Remote prefill: get all prompt blocks from remote.
            assert num_computed_tokens % self.block_size == 0
            rounded_num_prompt_tokens = round_down(
                len(request.prompt_token_ids), self.block_size)
            count = max(rounded_num_prompt_tokens - num_computed_tokens, 0)
            if count > 0:
                return count, True

        # No remote prefill for this request.
        return 0, False

    def update_state_after_alloc(self, request: "Request",
                                 blocks: "KVCacheBlocks",
                                 num_external_tokens: int):

        params = request.kv_transfer_params
        logger.debug(
            "NIXLConnector update_state_after_alloc: "
            "num_external_tokens=%s, kv_transfer_params=%s",
            num_external_tokens, params)

        if params is not None and params.get("do_remote_prefill"):
            if params.get("remote_block_ids"):
                if all(p in params for p in ("remote_engine_id", "remote_host",
                                             "remote_port")):
                    # If remote_blocks and num_external_tokens = 0, we have
                    # a full prefix cache hit on the D worker. We need to call
                    # send_notif in _read_blocks to free the memory on the P.
                    local_block_ids = (blocks.get_unhashed_block_ids()
                                       if num_external_tokens > 0 else [])
                    # Get unhashed blocks to pull from remote.
                    self._reqs_need_recv[request.request_id] = (
                        request, local_block_ids)
                else:
                    logger.warning(
                        "Got invalid KVTransferParams: %s. This "
                        "request will not utilize KVTransfer", params)
            else:
                assert num_external_tokens == 0
            # Only trigger 1 KV transfer per request.
            params["do_remote_prefill"] = False

    def build_connector_meta(
        self,
        scheduler_output: SchedulerOutput,
    ) -> KVConnectorMetadata:
        meta = NixlConnectorMetadata()

        # Loop through scheduled reqs and convert to ReqMeta.
        for req_id, (req, block_ids) in self._reqs_need_recv.items():
            assert req.kv_transfer_params is not None
            meta.add_new_req(
                request_id=req_id,
                local_block_ids=block_ids,
                kv_transfer_params=req.kv_transfer_params,
            )

        # Clear the list once workers start the transfers
        self._reqs_need_recv.clear()

        return meta

    def request_finished(
        self,
        request: "Request",
        block_ids: list[int],
    ) -> tuple[bool, Optional[dict[str, Any]]]:
        """
        Once a request is finished, determine whether request blocks
        should be freed now or will be sent asynchronously and freed later.
        """

        params = request.kv_transfer_params
        logger.debug(
            "NIXLConnector request_finished, request_status=%s, "
            "kv_transfer_params=%s", request.status, params)
        if not params:
            return False, None

        if params.get("do_remote_prefill"):
            # If do_remote_prefill is still True when the request is finished,
            # update_state_after_alloc must not have been called (the request
            # must have been aborted before it was scheduled).
            # To avoid stranding the prefill blocks in the prefill instance,
            # we must add empty block_ids to _reqs_need_recv so that our
            # worker side will notify and free blocks in the prefill instance.
            self._reqs_need_recv[request.request_id] = (request, [])
            params["do_remote_prefill"] = False
            return False, None

        if (not params.get("do_remote_decode")
                or request.status != RequestStatus.FINISHED_LENGTH_CAPPED):
            return False, None

        # Get computed blocks.
        all_full = request.num_computed_tokens % self.block_size == 0
        computed_block_ids = block_ids if all_full else block_ids[:-1]

        # If prompt < block_size, no xfer so free blocks immediately.
        delay_free_blocks = len(computed_block_ids) > 0

        return delay_free_blocks, dict(
            do_remote_prefill=True,
            do_remote_decode=False,
            remote_block_ids=computed_block_ids,
            remote_engine_id=self.engine_id,
            remote_host=self.side_channel_host,
            remote_port=self.side_channel_port,
            tp_size=self.vllm_config.parallel_config.tensor_parallel_size)

_reqs_need_recv instance-attribute

_reqs_need_recv: dict[ReqId, tuple[Request, list[int]]] = {}

block_size instance-attribute

block_size = block_size

engine_id instance-attribute

engine_id: EngineId = engine_id

side_channel_host instance-attribute

side_channel_host = VLLM_NIXL_SIDE_CHANNEL_HOST

side_channel_port instance-attribute

side_channel_port = (
    VLLM_NIXL_SIDE_CHANNEL_PORT
    + data_parallel_rank * tensor_parallel_size
)
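
The base port is offset per data-parallel rank so that every DP rank's tensor-parallel workers get a disjoint port range; within that range, each TP rank listens on base_port + tp_rank (see the worker below). A worked sketch with illustrative numbers (not the real defaults):

# Illustrative port math.
base = 5600                      # VLLM_NIXL_SIDE_CHANNEL_PORT (placeholder)
tp_size, dp_rank = 4, 1
side_channel_port = base + dp_rank * tp_size          # 5604 for DP rank 1
tp_rank_ports = [side_channel_port + r for r in range(tp_size)]
# -> [5604, 5605, 5606, 5607]: one listener port per TP rank.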

vllm_config instance-attribute

vllm_config = vllm_config

__init__

__init__(vllm_config: VllmConfig, engine_id: str)
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def __init__(self, vllm_config: VllmConfig, engine_id: str):
    self.vllm_config = vllm_config
    self.block_size = vllm_config.cache_config.block_size
    self.engine_id: EngineId = engine_id
    self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
    self.side_channel_port = (
        envs.VLLM_NIXL_SIDE_CHANNEL_PORT +
        vllm_config.parallel_config.data_parallel_rank *
        vllm_config.parallel_config.tensor_parallel_size)
    logger.info("Initializing NIXL Scheduler %s", engine_id)

    # Requests that need to start recv.
    # New requests are added by update_state_after_alloc in
    # the scheduler. Used to make metadata passed to Worker.
    self._reqs_need_recv: dict[ReqId, tuple[Request, list[int]]] = {}

build_connector_meta

build_connector_meta(
    scheduler_output: SchedulerOutput,
) -> KVConnectorMetadata
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def build_connector_meta(
    self,
    scheduler_output: SchedulerOutput,
) -> KVConnectorMetadata:
    meta = NixlConnectorMetadata()

    # Loop through scheduled reqs and convert to ReqMeta.
    for req_id, (req, block_ids) in self._reqs_need_recv.items():
        assert req.kv_transfer_params is not None
        meta.add_new_req(
            request_id=req_id,
            local_block_ids=block_ids,
            kv_transfer_params=req.kv_transfer_params,
        )

    # Clear the list once workers start the transfers
    self._reqs_need_recv.clear()

    return meta

get_num_new_matched_tokens

get_num_new_matched_tokens(
    request: Request, num_computed_tokens: int
) -> tuple[int, bool]

For remote prefill, pull all prompt blocks from remote asynchronously relative to engine execution.

Parameters:

* request (Request): the request object. (required)
* num_computed_tokens (int): the number of locally computed tokens for this request. (required)

Returns:

* the number of tokens that can be loaded from the external KV cache beyond what is already computed.
* true if the external KV cache tokens will be loaded asynchronously (between scheduler steps).

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def get_num_new_matched_tokens(
        self, request: "Request",
        num_computed_tokens: int) -> tuple[int, bool]:
    """
    For remote prefill, pull all prompt blocks from remote
    asynchronously relative to engine execution.

    Args:
        request (Request): the request object.
        num_computed_tokens (int): the number of locally
            computed tokens for this request
    Returns:
        * the number of tokens that can be loaded from the 
          external KV cache beyond what is already computed.
        * true if the external KV cache tokens will be loaded
          asynchronously (between scheduler steps).
    """

    params = request.kv_transfer_params
    logger.debug(
        "NIXLConnector get_num_new_matched_tokens: "
        "num_computed_tokens=%s, kv_transfer_params=%s",
        num_computed_tokens, params)

    if params is not None and params.get("do_remote_prefill"):
        # Remote prefill: get all prompt blocks from remote.
        assert num_computed_tokens % self.block_size == 0
        rounded_num_prompt_tokens = round_down(
            len(request.prompt_token_ids), self.block_size)
        count = max(rounded_num_prompt_tokens - num_computed_tokens, 0)
        if count > 0:
            return count, True

    # No remote prefill for this request.
    return 0, False

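As a worked example of the counting logic above (numbers are illustrative): only tokens that fall on full blocks are pulled, so the prompt length is rounded down to the block size before subtracting what is already computed locally.

# block_size = 16, 100-token prompt, 32 tokens already computed locally.
block_size = 16
prompt_len = 100
num_computed_tokens = 32

rounded = (prompt_len // block_size) * block_size   # round_down -> 96
count = max(rounded - num_computed_tokens, 0)       # 64
# get_num_new_matched_tokens returns (64, True): 64 tokens will be
# loaded asynchronously from the remote prefill instance.
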
request_finished

request_finished(
    request: Request, block_ids: list[int]
) -> tuple[bool, Optional[dict[str, Any]]]

Once a request is finished, determine whether request blocks should be freed now or will be sent asynchronously and freed later.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def request_finished(
    self,
    request: "Request",
    block_ids: list[int],
) -> tuple[bool, Optional[dict[str, Any]]]:
    """
    Once a request is finished, determine whether request blocks
    should be freed now or will be sent asynchronously and freed later.
    """

    params = request.kv_transfer_params
    logger.debug(
        "NIXLConnector request_finished, request_status=%s, "
        "kv_transfer_params=%s", request.status, params)
    if not params:
        return False, None

    if params.get("do_remote_prefill"):
        # If do_remote_prefill is still True when the request is finished,
        # update_state_after_alloc must not have been called (the request
        # must have been aborted before it was scheduled).
        # To avoid stranding the prefill blocks in the prefill instance,
        # we must add empty block_ids to _reqs_need_recv so that our
        # worker side will notify and free blocks in the prefill instance.
        self._reqs_need_recv[request.request_id] = (request, [])
        params["do_remote_prefill"] = False
        return False, None

    if (not params.get("do_remote_decode")
            or request.status != RequestStatus.FINISHED_LENGTH_CAPPED):
        return False, None

    # Get computed blocks.
    all_full = request.num_computed_tokens % self.block_size == 0
    computed_block_ids = block_ids if all_full else block_ids[:-1]

    # If prompt < block_size, no xfer so free blocks immediately.
    delay_free_blocks = len(computed_block_ids) > 0

    return delay_free_blocks, dict(
        do_remote_prefill=True,
        do_remote_decode=False,
        remote_block_ids=computed_block_ids,
        remote_engine_id=self.engine_id,
        remote_host=self.side_channel_host,
        remote_port=self.side_channel_port,
        tp_size=self.vllm_config.parallel_config.tensor_parallel_size)

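A worked sketch of the block bookkeeping above (numbers are illustrative): a partially filled last block is excluded from the blocks advertised to the decode side, and block freeing is delayed only if at least one full block remains to transfer.

block_size = 16
block_ids = [10, 11, 12, 13]
num_computed_tokens = 56         # not a multiple of 16 -> last block partial

all_full = num_computed_tokens % block_size == 0                  # False
computed_block_ids = block_ids if all_full else block_ids[:-1]    # [10, 11, 12]
delay_free_blocks = len(computed_block_ids) > 0                   # True
# request_finished returns (True, {... "remote_block_ids": [10, 11, 12] ...}),
# so the blocks are freed only after the decode side has read them.
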
update_state_after_alloc

update_state_after_alloc(
    request: Request,
    blocks: KVCacheBlocks,
    num_external_tokens: int,
)
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def update_state_after_alloc(self, request: "Request",
                             blocks: "KVCacheBlocks",
                             num_external_tokens: int):

    params = request.kv_transfer_params
    logger.debug(
        "NIXLConnector update_state_after_alloc: "
        "num_external_tokens=%s, kv_transfer_params=%s",
        num_external_tokens, params)

    if params is not None and params.get("do_remote_prefill"):
        if params.get("remote_block_ids"):
            if all(p in params for p in ("remote_engine_id", "remote_host",
                                         "remote_port")):
                # If remote_blocks and num_external_tokens = 0, we have
                # a full prefix cache hit on the D worker. We need to call
                # send_notif in _read_blocks to free the memory on the P.
                local_block_ids = (blocks.get_unhashed_block_ids()
                                   if num_external_tokens > 0 else [])
                # Get unhashed blocks to pull from remote.
                self._reqs_need_recv[request.request_id] = (
                    request, local_block_ids)
            else:
                logger.warning(
                    "Got invalid KVTransferParams: %s. This "
                    "request will not utilize KVTransfer", params)
        else:
            assert num_external_tokens == 0
        # Only trigger 1 KV transfer per request.
        params["do_remote_prefill"] = False

NixlConnectorWorker

Implementation of Worker side methods

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
class NixlConnectorWorker:
    """Implementation of Worker side methods"""

    def __init__(self, vllm_config: VllmConfig, engine_id: str):
        if NixlWrapper is None:
            logger.error("NIXL is not available")
            raise RuntimeError("NIXL is not available")
        logger.info("Initializing NIXL wrapper")
        logger.info("Initializing NIXL worker %s", engine_id)

        # Config.
        self.vllm_config = vllm_config
        self.block_size = vllm_config.cache_config.block_size

        # Agent.
        self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), None)
        # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}.
        self._remote_agents: dict[EngineId, dict[int, str]] = defaultdict(dict)

        # NIXL handshake port.
        # NOTE(rob): Within a DP group, each DP rank gets its own
        # base port (which is sent in the KVTransferParams).
        # Each TP rank listens/queries on the base_port + tp_rank.
        self.side_channel_port: int = (
            envs.VLLM_NIXL_SIDE_CHANNEL_PORT +
            vllm_config.parallel_config.data_parallel_rank *
            vllm_config.parallel_config.tensor_parallel_size)

        # Metadata.
        self.engine_id: EngineId = engine_id
        self.tp_rank = get_tensor_model_parallel_rank()
        self.world_size = get_tensor_model_parallel_world_size()
        self.tp_group = get_tp_group()

        # KV Caches and nixl tracking data.
        self.kv_caches: dict[str, torch.Tensor] = {}

        # Map of engine_id -> kv_caches_base_addr. For TP case, each local
        # rank will still only pull from a single remote TP worker.
        self.kv_caches_base_addr: dict[EngineId, list[int]] = {}

        # Number of NIXL regions. Currently one region per cache
        # (so 1 per layer for MLA, otherwise 2 per layer)
        self.num_regions = 0
        self.num_layers = 0

        # nixl_prepped_dlist_handle.
        self.src_xfer_side_handle: int = 0
        # Map of engine_id -> nixl_prepped_dlist_handle (int).
        self.dst_xfer_side_handles: dict[EngineId, int] = {}

        # Map of engine_id -> num_blocks. All ranks in the same deployment will
        # have the same number of blocks.
        self.dst_num_blocks: dict[EngineId, int] = {}
        self._registered_descs: list[Any] = []

        # In progress transfers.
        # [req_id -> list[handle]]
        self._recving_transfers = defaultdict[ReqId, list[Transfer]](list)

        # Complete transfer tracker. Used by the rank 0 to track finished
        # transactions on ranks 1 to N-1.
        # [req_id -> count]
        self._done_recving_count: defaultdict[ReqId,
                                              int] = defaultdict(lambda: 0)
        self._done_sending_count: defaultdict[ReqId,
                                              int] = defaultdict(lambda: 0)

        # Background thread for handling new handshake requests.
        self._nixl_handshake_listener_t: Optional[threading.Thread] = None
        # Background thread for initializing new NIXL handshakes.
        self._handshake_initiation_executor = ThreadPoolExecutor(
            # NIXL is not guaranteed to be thread-safe, limit 1 worker.
            max_workers=1,
            thread_name_prefix="vllm-nixl-handshake-initiator")
        self._ready_requests = queue.Queue[tuple[ReqId, ReqMeta]]()
        self._handshake_futures: dict[EngineId, Future[dict[int, str]]] = {}
        # Protects _handshake_futures and _remote_agents.
        self._handshake_lock = threading.RLock()

        self.vllm_config = vllm_config
        self.block_size = vllm_config.cache_config.block_size
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config

        # TODO(mgoin): remove this once we have hybrid memory allocator
        # Optimization for models with local attention (Llama 4)
        # List of block window sizes for each layer for local attention
        self.block_window_per_layer: list[Optional[int]] = []
        self.use_mla = self.model_config.use_mla

        backend = get_attn_backend(self.model_config.get_head_size(),
                                   self.model_config.dtype,
                                   self.cache_config.cache_dtype,
                                   self.block_size,
                                   self.model_config.is_attention_free,
                                   use_mla=self.use_mla)
        self.backend_name = backend.get_name()
        attn_backend = backend_name_to_enum(self.backend_name)
        self._use_flashinfer = attn_backend == _Backend.FLASHINFER_VLLM_V1
        logger.debug("Detected attention backend %s", self.backend_name)

        self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size}
        # With heterogeneous TP, P must wait for all assigned D TP workers to
        # finish reading before safely freeing the blocks.
        self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int)

    def __del__(self):
        """Cleanup background threads on destruction."""
        self._handshake_initiation_executor.shutdown(wait=False)
        if self._nixl_handshake_listener_t:
            self._nixl_handshake_listener_t.join(timeout=0)

    @staticmethod
    def _nixl_handshake_listener(metadata: NixlAgentMetadata,
                                 ready_event: threading.Event, base_port: int,
                                 tp_rank: int):
        """Background thread for getting new NIXL handshakes."""
        # NOTE(rob): this is a simple implementation. We will move
        # to a better approach via HTTP endpoint soon.

        encoder = msgspec.msgpack.Encoder()
        encoded_data = encoder.encode(metadata)
        size_in_bytes = len(encoded_data)
        logger.debug("Size of encoded NixlAgentMetadata: %s bytes",
                     str(size_in_bytes))

        # Listen for new requests for metadata.
        host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
        path = make_zmq_path("tcp", host, base_port + tp_rank)
        logger.debug("Starting listening on path: %s", path)
        with zmq_ctx(zmq.ROUTER, path) as sock:
            ready_event.set()
            while True:
                identity, _, msg = sock.recv_multipart()
                if msg != GET_META_MSG:
                    logger.warning(
                        "Connection listener got unexpected message %s", msg)
                sock.send_multipart((identity, b"", encoded_data))

    def _nixl_handshake(self, host: str, port: int,
                        remote_tp_size: int) -> dict[int, str]:
        """Do a NIXL handshake with a remote instance."""

        start_time = time.perf_counter()

        # NOTE(rob): we need each rank to have a unique port. This is
        # a hack to keep us moving. We will switch when moving to etcd
        # or where we have a single ZMQ socket in the scheduler.

        def handshake(path: str, rank: int) -> str:
            # Send query for the request.
            with zmq_ctx(zmq.REQ, path) as sock:
                sock.send(GET_META_MSG)
                metadata_bytes = sock.recv()
                decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
                metadata = decoder.decode(metadata_bytes)
                got_metadata_time = time.perf_counter()

                # Register Remote agent.
                remote_agent_name = self.add_remote_agent(
                    metadata, rank, remote_tp_size)
                setup_agent_time = time.perf_counter()

                logger.debug("NIXL handshake: get metadata took: %s",
                             got_metadata_time - start_time)
                logger.debug("NIXL handshake: add agent took: %s",
                             setup_agent_time - got_metadata_time)
                return remote_agent_name

        # Handshake only with the remote TP rank that current local rank will
        # pull from. With homogeneous TP it happens to be the same rank_i.
        tp_ratio = self._tp_size[self.engine_id] // remote_tp_size
        p_remote_rank = self.tp_rank // tp_ratio
        path = make_zmq_path("tcp", host, port + p_remote_rank)
        logger.debug("Querying metadata on path: %s at remote rank %s", path,
                     p_remote_rank)
        # Remote rank -> agent name.
        return {p_remote_rank: handshake(path, p_remote_rank)}

    def _background_nixl_handshake(self, req_id: str,
                                   remote_engine_id: EngineId, meta: ReqMeta):
        # Do NIXL handshake in background and add to _ready_requests when done.
        fut = self._handshake_futures.get(remote_engine_id)
        if fut is None:
            fut = self._handshake_initiation_executor.submit(
                self._nixl_handshake, meta.remote_host, meta.remote_port,
                meta.tp_size)
            self._handshake_futures[remote_engine_id] = fut

            def done_callback(f: Future[dict[int, str]], eid=remote_engine_id):
                with self._handshake_lock:
                    del self._handshake_futures[eid]
                    try:
                        self._remote_agents[eid] = f.result()
                    except Exception:
                        logger.exception("Handshake with %s failed", eid)

            fut.add_done_callback(done_callback)

        # TODO: handle failure state of future in the
        # callback, we want to fail the request in this case.
        def request_ready(_f: Future[Any], entry=(req_id, meta)):
            self._ready_requests.put(entry)

        fut.add_done_callback(request_ready)

    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
        """Register the KV Cache data in nixl."""

        _, first_kv_cache = next(iter(kv_caches.items()))
        kv_elem_size = first_kv_cache.element_size()

        # TODO(tms): Find a more robust way to detect and handle MLA
        # NOTE (NickLucche) To move blocks efficiently with NIXL, the expected
        # KV memory layout is HND, as opposed to the default NHD. Note that it
        # only affects the strides. For MLA we require no such thing and
        # resort to the standard layout.
        use_mla = len(first_kv_cache.shape) == 3
        assert use_mla == self.use_mla

        # TODO (NickLucche) not compatible with hybrid allocator. Enforce check
        # once it goes live, as a single kv layout is expected for xfers.
        if use_mla:
            # MLA case.
            self.num_blocks = first_kv_cache.shape[0]
            block_rank = 2  # [block_size, latent_dim]
            block_shape = first_kv_cache.shape[-block_rank:]
            block_size, kv_latent_dim = block_shape
            self.slot_size_bytes = kv_elem_size * kv_latent_dim
        else:
            # [2 (k and v), num_blocks, ...]
            if self._use_flashinfer:
                # FlashInfer swaps 2<->num_blocks dimensions.
                self.num_blocks = first_kv_cache.shape[0]
                block_rank = 4  # [2, block_size, kv_heads, head_dim]
            else:
                self.num_blocks = first_kv_cache.shape[1]
                block_rank = 3  # [block_size, kv_heads, head_dim]
            block_shape = first_kv_cache.shape[-block_rank:]
            block_size, n_kv_heads, head_dim = block_shape[-3:]
            # head size in bytes.
            self.slot_size_bytes = kv_elem_size * n_kv_heads * head_dim
        assert block_size == self.block_size
        # TODO(tms): self.block_len needs to be per-layer for sliding window,
        # hybrid attn, etc
        # block size in bytes
        self.block_len = kv_elem_size * math.prod(block_shape)
        logger.info(
            "Registering KV_Caches: use_mla: %s, num_blocks: %s, "
            "block_shape: %s, per_layer_kv_cache_shape: %s", use_mla,
            self.num_blocks, block_shape, first_kv_cache.shape)
        self.dst_num_blocks[self.engine_id] = self.num_blocks
        self.kv_caches = kv_caches
        kv_caches_base_addr = []
        caches_data = []

        # Note(tms): I modified this from the original region setup code.
        # K and V are now in different regions. Advantage is that we can
        # elegantly support MLA and any cases where the K and V tensors
        # are non-contiguous (it's not locally guaranteed that they will be)
        # Disadvantage is that the encoded NixlAgentMetadata is now larger
        # (roughly 8KB vs 5KB).
        # Conversely for FlashInfer, K and V are transferred in the same tensor
        # to better exploit the memory layout (ie num_blocks is the first dim).
        for cache_or_caches in kv_caches.values():
            # Normalize to always be a list of caches
            cache_list = [cache_or_caches] if use_mla or self._use_flashinfer \
                else cache_or_caches
            for cache in cache_list:
                base_addr = cache.data_ptr()
                region_len = self.num_blocks * self.block_len
                caches_data.append(
                    (base_addr, region_len, cache.device.index, ""))
                kv_caches_base_addr.append(base_addr)
        self.kv_caches_base_addr[self.engine_id] = kv_caches_base_addr
        self.num_regions = len(caches_data)
        self.num_layers = len(self.kv_caches.keys())

        # TODO(mgoin): remove this once we have hybrid memory allocator
        # Optimization for models with local attention (Llama 4)
        if self.vllm_config.model_config.hf_config.model_type == "llama4":
            from transformers import Llama4TextConfig
            assert isinstance(self.vllm_config.model_config.hf_text_config,
                              Llama4TextConfig)
            llama4_config = self.vllm_config.model_config.hf_text_config
            no_rope_layers = llama4_config.no_rope_layers
            chunk_size = llama4_config.attention_chunk_size
            chunk_block_size = math.ceil(chunk_size / self.block_size)
            for layer_idx in range(self.num_layers):
                # no_rope_layers[layer_idx] == 0 means NoPE (global)
                # Any other value means RoPE (local chunked)
                is_local_attention = no_rope_layers[layer_idx] != 0
                block_window = chunk_block_size if is_local_attention else None
                self.block_window_per_layer.append(block_window)
            logger.debug("Llama 4 block window per layer mapping: %s",
                         self.block_window_per_layer)
            assert len(self.block_window_per_layer) == self.num_layers

        descs = self.nixl_wrapper.get_reg_descs(caches_data, "VRAM")
        logger.debug("Registering descs: %s", caches_data)
        self.nixl_wrapper.register_memory(descs)
        logger.debug("Done registering descs")
        self._registered_descs.append(descs)

        # Register local/src descr for NIXL xfer.
        blocks_data = []
        for base_addr in self.kv_caches_base_addr[self.engine_id]:
            # NOTE With heter-TP, more blocks are prepared than what are
            # needed as self.num_blocks >= nixl_agent_meta.num_blocks. We
            # could create fewer, but then _get_block_descs_ids needs to
            # select agent_meta.num_blocks instead of self.num_blocks for
            # local descr, and that makes handling regular flow less clean.
            for block_id in range(self.num_blocks):
                block_offset = block_id * self.block_len
                addr = base_addr + block_offset
                # (addr, len, device id)
                blocks_data.append((addr, self.block_len, self.tp_rank))
        logger.debug("Created %s blocks for src engine %s and rank %s",
                     len(blocks_data), self.engine_id, self.tp_rank)

        descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
        # NIXL_INIT_AGENT to be used for preparations of local descs.
        self.src_xfer_side_handle = self.nixl_wrapper.prep_xfer_dlist(
            "NIXL_INIT_AGENT", descs)

        # After KV Caches registered, listen for new connections.
        metadata = NixlAgentMetadata(
            engine_id=self.engine_id,
            agent_metadata=self.nixl_wrapper.get_agent_metadata(),
            kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id],
            num_blocks=self.num_blocks,
            block_len=self.block_len,
            attn_backend_name=self.backend_name)
        ready_event = threading.Event()
        self._nixl_handshake_listener_t = threading.Thread(
            target=self._nixl_handshake_listener,
            args=(metadata, ready_event, self.side_channel_port, self.tp_rank),
            daemon=True,
            name="nixl_handshake_listener")
        self._nixl_handshake_listener_t.start()
        ready_event.wait()  # Wait for listener ZMQ socket to be ready.

    def add_remote_agent(self,
                         nixl_agent_meta: NixlAgentMetadata,
                         remote_tp_rank: int = 0,
                         remote_tp_size: int = 1) -> str:
        """
        Add the remote NIXL agent and prepare the descriptors for reading cache
        blocks from remote.

        In particular, handle both homogeneous and heterogeneous TP. The former
        requires local rank_i to read from remote rank_i. 
        The latter, assuming D.world_size > P.world_size, requires that two or
        more local TP workers share the xfer from a single remote TP worker.

        Here's an example:

        rank_offset     p_remote_tp_rank
        (kv split no)    
        --------------------------------
            0                 0      Worker0  ---- 1st half of KV ----> Worker0  [ KV Cache ]
                                                                        /
            1                 0      Worker1  ---- 2nd half of KV -----/

            0                 1      Worker2  ---- 1st half of KV ----> Worker1  [ KV Cache ]
                                                                        /
            1                 1      Worker3  ---- 2nd half of KV -----/


                                Decoder TP workers                     Prefill TP workers
                                  (world_size=4)                         (world_size=2)
                                                 tp_ratio = 4 // 2 = 2                  

        Considering the KV Caches, if P-Worker_i has cache size [2, num_blocksP, kv_heads, block_size, head_dim]  
        then D-Worker_j has [2, num_blocksD, kv_heads//tp_ratio, block_size, head_dim]. Mind the "HND" layout format.
        Assuming num_blocksD >= num_blocksP, D-Worker0 reads from P-Worker0 by preparing the kv_heads//tp_ratio 
        first heads from all the slots of all the blocks. D-Worker1 will do the same, but reading the second split
        along the kv_heads dimension, and so forth until "tp_ratio" D TP workers have pulled from P-Worker0.   

        Note that the above will also hold true for the homogeneous TP case, where tp_ratio evaluates to 1.

        Regarding MLA case, the cache is replicated across TP workers so the rank_offset will just always be 0
        so that the whole cache is shared by "tp_ratio" D TP workers.
        """ # noqa: E501
        engine_id = nixl_agent_meta.engine_id
        # TODO re-evaluate refreshing for scaling/recovery
        if remote_tp_rank in self._remote_agents.get(engine_id, {}):
            return self._remote_agents[engine_id][remote_tp_rank]

        if engine_id in self._tp_size:
            assert self._tp_size[engine_id] == remote_tp_size
        else:
            self._tp_size[engine_id] = remote_tp_size
        # We may eventually enable this after asserting equality in cache
        # layout and close outputs.
        assert nixl_agent_meta.attn_backend_name == self.backend_name

        remote_agent_name = self.nixl_wrapper.add_remote_agent(
            nixl_agent_meta.agent_metadata)

        # Number of D TP workers reading from a single P TP worker. This is
        # 1 when P and D `--tensor-parallel-size` match.
        tp_ratio = divide(self._tp_size[self.engine_id],
                          self._tp_size[engine_id])
        assert tp_ratio > 0, "Decode TP cannot be smaller than prefill TP"

        # Handle tp_size>num_kv_heads: replicate KV cache.
        total_num_kv_heads = self.model_config.get_total_num_kv_heads()
        is_kv_replicated = self._tp_size[engine_id] // total_num_kv_heads >= 1

        if self.use_mla or is_kv_replicated:
            # With MLA the only difference is in the number of blocks.
            remote_block_size = nixl_agent_meta.block_len // (
                self.slot_size_bytes)
            assert self.block_len == nixl_agent_meta.block_len
        else:
            remote_block_size = nixl_agent_meta.block_len // (
                self.slot_size_bytes * tp_ratio)
            if self._use_flashinfer:
                # Account for joint KV in FlashInfer.
                remote_block_size //= 2

            assert nixl_agent_meta.block_len == self.block_len * tp_ratio, (
                "Remote P worker KV layer cache must be of shape [2, N, "
                "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype."
            )

        assert self.block_size == remote_block_size, (
            "Remote P worker with different block size is not supported "
            f"{self.block_size=} {remote_block_size=}")

        # Create dst descs and xfer side handles. TP workers have same #blocks.
        if engine_id in self.dst_num_blocks:
            assert self.dst_num_blocks[engine_id] == nixl_agent_meta.num_blocks
        else:
            self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks

        blocks_data = []
        # With homogeneous TP, D pulls the whole kv cache from corresponding
        # rank. With heterogeneous TP, prepare the descriptors by splitting the
        # P KV cache along kv_head dim, of D worker's kv_head size (D>P).
        # Eg. PTP1 DTP2 => P0 KV:[block0-KV_0 | block0-KV_1..].
        # Only register the remote's descriptors if current rank pulls from it.
        self.kv_caches_base_addr[
            engine_id] = nixl_agent_meta.kv_caches_base_addr
        rank_offset = self.tp_rank % tp_ratio * self.block_len \
            if not (self.use_mla or is_kv_replicated) else 0
        # Register all remote blocks, but only the corresponding kv heads.
        for base_addr in nixl_agent_meta.kv_caches_base_addr:
            for block_id in range(nixl_agent_meta.num_blocks):
                block_offset = block_id * nixl_agent_meta.block_len
                # For each block, grab the heads chunk belonging to rank_i
                # of size remote_nheads // tp_ratio, which correspond to
                # self.block_len == remote_block_len//tp_ratio bytes.
                addr = base_addr + block_offset + rank_offset
                # (addr, len, device id)
                blocks_data.append((addr, self.block_len, remote_tp_rank))
        logger.debug(
            "Created %s blocks for dst engine %s with remote rank %s and "
            "local rank %s", len(blocks_data), engine_id, remote_tp_rank,
            self.tp_rank)

        # Register with NIXL.
        descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
        self.dst_xfer_side_handles[
            engine_id] = self.nixl_wrapper.prep_xfer_dlist(
                remote_agent_name, descs)

        return remote_agent_name

    def get_finished(self) -> tuple[set[str], set[str]]:
        """
        Get requests that are done sending or recving.

        In TP>1 setup, each rank exchanges KVs with its counterpart
        ranks independently. get_finished() runs in each worker and builds
        the done_sending and done_recving sets that Rank 0 sends to the
        scheduler via ModelRunnerOutput. To ensure transactions are done
        before being reported as finished, Ranks 1 to N-1 notify Rank 0
        once their transactions are done, and Rank 0 returns the finished
        sets to the Scheduler only once all ranks are done.
        """
        done_sending = self._get_new_notifs()
        done_recving = self._pop_done_transfers(self._recving_transfers)
        if len(done_sending) > 0 or len(done_recving) > 0:
            logger.debug(
                "Rank %s, get_finished: %s requests done sending "
                "and %s requests done recving", self.tp_rank,
                len(done_sending), len(done_recving))

        if self.world_size == 1:
            return done_sending, done_recving

        # Rank 0: get finished from all other ranks.
        if self.tp_rank == 0:
            for req_id in done_sending:
                self._done_sending_count[req_id] += 1
            for req_id in done_recving:
                self._done_recving_count[req_id] += 1

            # Keep track of how many other ranks have finished.
            other_ranks_finished_ids: list[str] = []
            for i in range(1, self.world_size):
                other_ranks_finished_ids.extend(
                    self.tp_group.recv_object(src=i))
            for req_id in other_ranks_finished_ids:
                if (req_id in self._done_recving_count
                        or req_id in self._recving_transfers):
                    self._done_recving_count[req_id] += 1
                else:
                    self._done_sending_count[req_id] += 1

            # Return ids that finished on all ranks to the scheduler.
            all_done_recving: set[str] = set()
            for req_id in list(self._done_recving_count.keys()):
                if self._done_recving_count[req_id] == self.world_size:
                    del self._done_recving_count[req_id]
                    all_done_recving.add(req_id)

            all_done_sending: set[str] = set()
            for req_id in list(self._done_sending_count.keys()):
                if self._done_sending_count[req_id] == self.world_size:
                    del self._done_sending_count[req_id]
                    all_done_sending.add(req_id)

            return all_done_sending, all_done_recving

        # Ranks 1 to N-1: send finished ids to Rank 0.
        else:
            finished_req_ids = list(done_recving.union(done_sending))
            self.tp_group.send_object(finished_req_ids, dst=0)

            # Unused as only Rank 0 results are sent to scheduler.
            return done_sending, done_recving

    def _get_new_notifs(self) -> set[str]:
        """
        Get req_ids which got a remote xfer message. When multiple consumers
        are reading from the same producer (heterogeneous TP scenario), wait
        for all consumers to be done pulling.
        """
        notified_req_ids: set[str] = set()
        for notifs in self.nixl_wrapper.get_new_notifs().values():
            for notif in notifs:
                req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1)
                self.consumer_notification_counts_by_req[req_id] += 1
                # Wait for all consumers (D) to be done reading before freeing.
                if self.consumer_notification_counts_by_req[req_id] == int(
                        tp_ratio):
                    notified_req_ids.add(req_id)
                    del self.consumer_notification_counts_by_req[req_id]
        return notified_req_ids

    def _pop_done_transfers(
            self, transfers: dict[str, list[tuple[int, float]]]) -> set[str]:
        """
        Pop completed xfers by checking for DONE state.
        Args:
            transfers: dict of req_id -> list[running_xfer]
        Returns:
            set of req_ids that have all done xfers
        """
        done_req_ids: set[str] = set()
        for req_id, handles in list(transfers.items()):
            in_progress = False
            for handle, _xfer_stime in handles:
                xfer_state = self.nixl_wrapper.check_xfer_state(handle)
                if xfer_state == "DONE":
                    self.nixl_wrapper.release_xfer_handle(handle)
                elif xfer_state == "PROC":
                    in_progress = True
                    continue
                else:
                    raise RuntimeError(
                        f"Transfer failed with state {xfer_state}")
            if not in_progress:
                done_req_ids.add(req_id)
                del transfers[req_id]
        return done_req_ids

    def start_load_kv(self, metadata: NixlConnectorMetadata):
        """
        Start loading by triggering non-blocking nixl_xfer.
        We check for these transfers to complete in each step().
        """
        for req_id, meta in metadata.requests.items():
            remote_engine_id = meta.remote_engine_id
            logger.debug(
                "start_load_kv for request %s from remote engine %s. "
                "Num local_block_ids: %s. Num remote_block_ids: %s. ", req_id,
                remote_engine_id, len(meta.local_block_ids),
                len(meta.remote_block_ids))
            if remote_engine_id not in self._remote_agents:
                # Initiate handshake with remote engine to exchange metadata.
                with self._handshake_lock:
                    if remote_engine_id not in self._remote_agents:
                        self._background_nixl_handshake(
                            req_id, remote_engine_id, meta)
                        continue

            # Handshake already completed, start async read xfer.
            self._read_blocks_for_req(req_id, meta)

        # Start transfers for requests whose handshakes have now finished.
        while not self._ready_requests.empty():
            self._read_blocks_for_req(*self._ready_requests.get_nowait())

    def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
        logger.debug(
            "Remote agent %s available, calling _read_blocks for req %s",
            meta.remote_engine_id, req_id)
        self._read_blocks(
            request_id=req_id,
            dst_engine_id=meta.remote_engine_id,
            local_block_ids=meta.local_block_ids,
            remote_block_ids=meta.remote_block_ids,
        )

    def _read_blocks(self, local_block_ids: list[int],
                     remote_block_ids: list[int], dst_engine_id: str,
                     request_id: str):
        # NOTE(rob): having the staging blocks be on the READER side is
        # not going to work well, since we will have to rearrange tensors
        # after we detect the txn is complete (which means we cannot make the
        # read txn async easily). If we want to make "READ" happen cleanly,
        # then we will need to have the staging blocks on the remote side.

        # NOTE(rob): according to nvidia the staging blocks are used to
        # saturate IB with heterogeneous TP sizes. We should remove the staging
        # blocks until we are ready.

        # Number of D TP workers that will read from dst P. Propagate tp_ratio
        # on notification so that dst worker can wait before freeing blocks.
        tp_ratio = self._tp_size[
            self.engine_id] // self._tp_size[dst_engine_id]
        notif_id = f"{request_id}:{tp_ratio}".encode()

        # Full prefix cache hit: do not need to read remote blocks,
        # just notify P worker that we have the blocks we need.
        num_local_blocks = len(local_block_ids)
        if num_local_blocks == 0:
            remote_rank = self.tp_rank // tp_ratio
            agent_name = self._remote_agents[dst_engine_id][remote_rank]
            self.nixl_wrapper.send_notif(agent_name, notif_msg=notif_id)
            return

        # Partial prefix cache hit: just read uncomputed blocks.
        num_remote_blocks = len(remote_block_ids)
        assert num_local_blocks <= num_remote_blocks
        if num_local_blocks < num_remote_blocks:
            remote_block_ids = remote_block_ids[-num_local_blocks:]

        # Get side handles.
        local_xfer_side_handle = self.src_xfer_side_handle
        remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id]

        # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from
        # corresponding rank. With heterogeneous TP, fixing D>P, the D tp
        # workers will issue xfers to parts of the P worker remote kv caches.

        # Get descs ids.
        local_block_descs_ids: list[int] = []
        remote_block_descs_ids: list[int] = []
        if not self.block_window_per_layer:
            # Default case: assume global attention
            remote_block_descs_ids = self._get_block_descs_ids(
                dst_engine_id, remote_block_ids)
            local_block_descs_ids = self._get_block_descs_ids(
                self.engine_id, local_block_ids)
        else:
            # TODO(mgoin): remove this once we have hybrid memory allocator
            # Optimization for models with local attention (Llama 4)
            for layer_idx, block_window in enumerate(
                    self.block_window_per_layer):
                # For each layer:
                if block_window is None:
                    # If not chunked, we just use the
                    # full block lists (global attention)
                    layer_local_block_ids = local_block_ids
                    layer_remote_block_ids = remote_block_ids
                else:
                    # If chunked, get the last block_window blocks
                    layer_local_block_ids = local_block_ids[-block_window:]
                    layer_remote_block_ids = remote_block_ids[-block_window:]

                # Get descs ids for the layer.
                layer_local_desc_ids = self._get_block_descs_ids(
                    self.engine_id, layer_local_block_ids, layer_idx)
                layer_remote_desc_ids = self._get_block_descs_ids(
                    dst_engine_id, layer_remote_block_ids, layer_idx)

                local_block_descs_ids.extend(layer_local_desc_ids)
                remote_block_descs_ids.extend(layer_remote_desc_ids)

        assert len(local_block_descs_ids) == len(remote_block_descs_ids)

        # Prepare transfer with Nixl.
        handle = self.nixl_wrapper.make_prepped_xfer(
            "READ",
            local_xfer_side_handle,
            local_block_descs_ids,
            remote_xfer_side_handle,
            remote_block_descs_ids,
            notif_msg=notif_id,
        )

        # Begin async xfer.
        self.nixl_wrapper.transfer(handle)

        # Use handle to check completion in future step().
        # TODO (NickLucche) surface xfer elapsed time
        self._recving_transfers[request_id].append(
            (handle, time.perf_counter()))

    def _get_block_descs_ids(self,
                             engine_id: str,
                             block_ids: list[int],
                             layer_idx: Optional[int] = None) -> list[int]:
        """
        Get the descs ids for a set of block ids.
        If layer_idx is provided, we use the region_ids for the given layer.
        Otherwise, we use all regions.
        """
        if layer_idx is None:
            region_ids = range(self.num_regions)
        else:
            assert layer_idx < self.num_layers
            if self.num_layers < self.num_regions:
                # If we have more regions than layers, we assume that
                # the regions are organized as [K0, V0, K1, V1, ...]
                # and we select K_i and V_i
                assert 2 * self.num_layers == self.num_regions
                region_ids = range(2 * layer_idx, 2 * layer_idx + 2)
            else:
                # Otherwise, we assume we have MLA and select i-th layer
                assert self.num_layers == self.num_regions
                region_ids = range(layer_idx, layer_idx + 1)

        num_blocks = self.dst_num_blocks[engine_id]

        # Compute the desc ids for each block.
        descs_ids: list[int] = []
        for reg_id in region_ids:
            for block_id in block_ids:
                descs_ids.append(reg_id * num_blocks + block_id)
        return descs_ids

_done_recving_count instance-attribute

_done_recving_count: defaultdict[ReqId, int] = defaultdict(
    lambda: 0
)

_done_sending_count instance-attribute

_done_sending_count: defaultdict[ReqId, int] = defaultdict(
    lambda: 0
)

_handshake_futures instance-attribute

_handshake_futures: dict[
    EngineId, Future[dict[int, str]]
] = {}

_handshake_initiation_executor instance-attribute

_handshake_initiation_executor = ThreadPoolExecutor(
    max_workers=1,
    thread_name_prefix="vllm-nixl-handshake-initiator",
)

_handshake_lock instance-attribute

_handshake_lock = RLock()

_nixl_handshake_listener_t instance-attribute

_nixl_handshake_listener_t: Optional[Thread] = None

_ready_requests instance-attribute

_ready_requests = Queue[tuple[ReqId, ReqMeta]]()

_recving_transfers instance-attribute

_recving_transfers = defaultdict[ReqId, list[Transfer]](
    list
)

_registered_descs instance-attribute

_registered_descs: list[Any] = []

_remote_agents instance-attribute

_remote_agents: dict[EngineId, dict[int, str]] = (
    defaultdict(dict)
)

_tp_size instance-attribute

_tp_size: dict[EngineId, int] = {engine_id: world_size}

_use_flashinfer instance-attribute

_use_flashinfer = attn_backend == FLASHINFER_VLLM_V1

backend_name instance-attribute

backend_name = get_name()

block_size instance-attribute

block_size = block_size

block_window_per_layer instance-attribute

block_window_per_layer: list[Optional[int]] = []

cache_config instance-attribute

cache_config = cache_config

consumer_notification_counts_by_req instance-attribute

consumer_notification_counts_by_req = defaultdict[
    ReqId, int
](int)

dst_num_blocks instance-attribute

dst_num_blocks: dict[EngineId, int] = {}

dst_xfer_side_handles instance-attribute

dst_xfer_side_handles: dict[EngineId, int] = {}

engine_id instance-attribute

engine_id: EngineId = engine_id

kv_caches instance-attribute

kv_caches: dict[str, Tensor] = {}

kv_caches_base_addr instance-attribute

kv_caches_base_addr: dict[EngineId, list[int]] = {}

model_config instance-attribute

model_config = model_config

nixl_wrapper instance-attribute

nixl_wrapper = nixl_agent(str(uuid4()), None)

num_layers instance-attribute

num_layers = 0

num_regions instance-attribute

num_regions = 0

side_channel_port instance-attribute

side_channel_port: int = (
    VLLM_NIXL_SIDE_CHANNEL_PORT
    + data_parallel_rank * tensor_parallel_size
)

src_xfer_side_handle instance-attribute

src_xfer_side_handle: int = 0

tp_group instance-attribute

tp_group = get_tp_group()

tp_rank instance-attribute

tp_rank = get_tensor_model_parallel_rank()

use_mla instance-attribute

use_mla = use_mla

vllm_config instance-attribute

vllm_config = vllm_config

world_size instance-attribute

world_size = get_tensor_model_parallel_world_size()

__del__

__del__()

Cleanup background threads on destruction.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def __del__(self):
    """Cleanup background threads on destruction."""
    self._handshake_initiation_executor.shutdown(wait=False)
    if self._nixl_handshake_listener_t:
        self._nixl_handshake_listener_t.join(timeout=0)

__init__

__init__(vllm_config: VllmConfig, engine_id: str)
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def __init__(self, vllm_config: VllmConfig, engine_id: str):
    if NixlWrapper is None:
        logger.error("NIXL is not available")
        raise RuntimeError("NIXL is not available")
    logger.info("Initializing NIXL wrapper")
    logger.info("Initializing NIXL worker %s", engine_id)

    # Config.
    self.vllm_config = vllm_config
    self.block_size = vllm_config.cache_config.block_size

    # Agent.
    self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), None)
    # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}.
    self._remote_agents: dict[EngineId, dict[int, str]] = defaultdict(dict)

    # NIXL handshake port.
    # NOTE(rob): Within a DP group, each DP rank gets its own
    # base port (which is sent in the KVTransferParams).
    # Each TP rank listens/queries on the base_port + tp_rank.
    self.side_channel_port: int = (
        envs.VLLM_NIXL_SIDE_CHANNEL_PORT +
        vllm_config.parallel_config.data_parallel_rank *
        vllm_config.parallel_config.tensor_parallel_size)

    # Metadata.
    self.engine_id: EngineId = engine_id
    self.tp_rank = get_tensor_model_parallel_rank()
    self.world_size = get_tensor_model_parallel_world_size()
    self.tp_group = get_tp_group()

    # KV Caches and nixl tracking data.
    self.kv_caches: dict[str, torch.Tensor] = {}

    # Map of engine_id -> kv_caches_base_addr. For TP case, each local
    # rank will still only pull from a single remote TP worker.
    self.kv_caches_base_addr: dict[EngineId, list[int]] = {}

    # Number of NIXL regions. Currently one region per cache
    # (so 1 per layer for MLA, otherwise 2 per layer)
    self.num_regions = 0
    self.num_layers = 0

    # nixl_prepped_dlist_handle.
    self.src_xfer_side_handle: int = 0
    # Map of engine_id -> nixl_prepped_dlist_handle (int).
    self.dst_xfer_side_handles: dict[EngineId, int] = {}

    # Map of engine_id -> num_blocks. All ranks in the same deployment will
    # have the same number of blocks.
    self.dst_num_blocks: dict[EngineId, int] = {}
    self._registered_descs: list[Any] = []

    # In progress transfers.
    # [req_id -> list[handle]]
    self._recving_transfers = defaultdict[ReqId, list[Transfer]](list)

    # Complete transfer tracker. Used by rank 0 to track finished
    # transactions on ranks 1 to N-1.
    # [req_id -> count]
    self._done_recving_count: defaultdict[ReqId,
                                          int] = defaultdict(lambda: 0)
    self._done_sending_count: defaultdict[ReqId,
                                          int] = defaultdict(lambda: 0)

    # Background thread for handling new handshake requests.
    self._nixl_handshake_listener_t: Optional[threading.Thread] = None
    # Background thread for initializing new NIXL handshakes.
    self._handshake_initiation_executor = ThreadPoolExecutor(
        # NIXL is not guaranteed to be thread-safe, limit 1 worker.
        max_workers=1,
        thread_name_prefix="vllm-nixl-handshake-initiator")
    self._ready_requests = queue.Queue[tuple[ReqId, ReqMeta]]()
    self._handshake_futures: dict[EngineId, Future[dict[int, str]]] = {}
    # Protects _handshake_futures and _remote_agents.
    self._handshake_lock = threading.RLock()

    self.vllm_config = vllm_config
    self.block_size = vllm_config.cache_config.block_size
    self.model_config = vllm_config.model_config
    self.cache_config = vllm_config.cache_config

    # TODO(mgoin): remove this once we have hybrid memory allocator
    # Optimization for models with local attention (Llama 4)
    # List of block window sizes for each layer for local attention
    self.block_window_per_layer: list[Optional[int]] = []
    self.use_mla = self.model_config.use_mla

    backend = get_attn_backend(self.model_config.get_head_size(),
                               self.model_config.dtype,
                               self.cache_config.cache_dtype,
                               self.block_size,
                               self.model_config.is_attention_free,
                               use_mla=self.use_mla)
    self.backend_name = backend.get_name()
    attn_backend = backend_name_to_enum(self.backend_name)
    self._use_flashinfer = attn_backend == _Backend.FLASHINFER_VLLM_V1
    logger.debug("Detected attention backend %s", self.backend_name)

    self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size}
    # With heterogeneous TP, P must wait for all assigned D TP workers to
    # finish reading before safely freeing the blocks.
    self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int)
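
The side-channel port arithmetic in the constructor gives every (DP rank, TP rank) pair its own listener port. A minimal sketch of that layout, using an assumed base port rather than the real VLLM_NIXL_SIDE_CHANNEL_PORT default:

# Sketch only: BASE_PORT is a stand-in for envs.VLLM_NIXL_SIDE_CHANNEL_PORT.
BASE_PORT = 5600
tensor_parallel_size = 4
data_parallel_rank = 1

# Each DP rank gets a contiguous block of ports (as in __init__ above)...
side_channel_port = BASE_PORT + data_parallel_rank * tensor_parallel_size

# ...and each TP rank within that DP rank listens on base + tp_rank
# (see _nixl_handshake_listener, which binds to base_port + tp_rank).
listener_ports = [side_channel_port + tp_rank
                  for tp_rank in range(tensor_parallel_size)]
assert listener_ports == [5604, 5605, 5606, 5607]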

_background_nixl_handshake

_background_nixl_handshake(
    req_id: str, remote_engine_id: EngineId, meta: ReqMeta
)
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def _background_nixl_handshake(self, req_id: str,
                               remote_engine_id: EngineId, meta: ReqMeta):
    # Do NIXL handshake in background and add to _ready_requests when done.
    fut = self._handshake_futures.get(remote_engine_id)
    if fut is None:
        fut = self._handshake_initiation_executor.submit(
            self._nixl_handshake, meta.remote_host, meta.remote_port,
            meta.tp_size)
        self._handshake_futures[remote_engine_id] = fut

        def done_callback(f: Future[dict[int, str]], eid=remote_engine_id):
            with self._handshake_lock:
                del self._handshake_futures[eid]
                try:
                    self._remote_agents[eid] = f.result()
                except Exception:
                    logger.exception("Handshake with %s failed", eid)

        fut.add_done_callback(done_callback)

    # TODO: handle failure state of future in the
    # callback, we want to fail the request in this case.
    def request_ready(_f: Future[Any], entry=(req_id, meta)):
        self._ready_requests.put(entry)

    fut.add_done_callback(request_ready)

_get_block_descs_ids

_get_block_descs_ids(
    engine_id: str,
    block_ids: list[int],
    layer_idx: Optional[int] = None,
) -> list[int]

Get the descs ids for a set of block ids. If layer_idx is provided, we use the region_ids for the given layer. Otherwise, we use all regions.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def _get_block_descs_ids(self,
                         engine_id: str,
                         block_ids: list[int],
                         layer_idx: Optional[int] = None) -> list[int]:
    """
    Get the descs ids for a set of block ids.
    If layer_idx is provided, we use the region_ids for the given layer.
    Otherwise, we use all regions.
    """
    if layer_idx is None:
        region_ids = range(self.num_regions)
    else:
        assert layer_idx < self.num_layers
        if self.num_layers < self.num_regions:
            # If we have more regions than layers, we assume that
            # the regions are organized as [K0, V0, K1, V1, ...]
            # and we select K_i and V_i
            assert 2 * self.num_layers == self.num_regions
            region_ids = range(2 * layer_idx, 2 * layer_idx + 2)
        else:
            # Otherwise, we assume we have MLA and select i-th layer
            assert self.num_layers == self.num_regions
            region_ids = range(layer_idx, layer_idx + 1)

    num_blocks = self.dst_num_blocks[engine_id]

    # Compute the desc ids for each block.
    descs_ids: list[int] = []
    for reg_id in region_ids:
        for block_id in block_ids:
            descs_ids.append(reg_id * num_blocks + block_id)
    return descs_ids
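
A worked example of the descriptor-id arithmetic above, on an assumed toy layout (two regions, four remote blocks):

# Toy values, not taken from a real deployment.
num_regions = 2      # e.g. one K region and one V region of a single layer
num_blocks = 4       # dst_num_blocks for the remote engine
block_ids = [1, 3]   # blocks to transfer

descs_ids = [reg_id * num_blocks + block_id
             for reg_id in range(num_regions)
             for block_id in block_ids]
# Region 0 contributes descs 1 and 3; region 1 contributes 5 and 7.
assert descs_ids == [1, 3, 5, 7]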

_get_new_notifs

_get_new_notifs() -> set[str]

Get req_ids which got a remote xfer message. When multiple consumers are reading from the same producer (heterogeneous TP scenario), wait for all consumers to be done pulling.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def _get_new_notifs(self) -> set[str]:
    """
    Get req_ids which got a remote xfer message. When multiple consumers
    are reading from the same producer (heterogeneous TP scenario), wait
    for all consumers to be done pulling.
    """
    notified_req_ids: set[str] = set()
    for notifs in self.nixl_wrapper.get_new_notifs().values():
        for notif in notifs:
            req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1)
            self.consumer_notification_counts_by_req[req_id] += 1
            # Wait for all consumers (D) to be done reading before freeing.
            if self.consumer_notification_counts_by_req[req_id] == int(
                    tp_ratio):
                notified_req_ids.add(req_id)
                del self.consumer_notification_counts_by_req[req_id]
    return notified_req_ids
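
The notification payload parsed here is the plain "req_id:tp_ratio" string produced by _read_blocks. A minimal round trip of that format (the request id below is made up):

# Encode side (decode worker, see _read_blocks).
request_id, tp_ratio = "req-abc123", 2
notif_id = f"{request_id}:{tp_ratio}".encode()

# Decode side (prefill worker, as in _get_new_notifs). rsplit keeps any ':'
# that may appear inside the request id itself.
req_id, ratio = notif_id.decode("utf-8").rsplit(":", 1)
assert (req_id, int(ratio)) == ("req-abc123", 2)
# Blocks are only freed once int(ratio) consumers have sent this notification.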

_nixl_handshake

_nixl_handshake(
    host: str, port: int, remote_tp_size: int
) -> dict[int, str]

Do a NIXL handshake with a remote instance.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def _nixl_handshake(self, host: str, port: int,
                    remote_tp_size: int) -> dict[int, str]:
    """Do a NIXL handshake with a remote instance."""

    start_time = time.perf_counter()

    # NOTE(rob): we need each rank to have a unique port. This is
    # a hack to keep us moving. We will switch when moving to etcd
    # or where we have a single ZMQ socket in the scheduler.

    def handshake(path: str, rank: int) -> str:
        # Send query for the request.
        with zmq_ctx(zmq.REQ, path) as sock:
            sock.send(GET_META_MSG)
            metadata_bytes = sock.recv()
            decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
            metadata = decoder.decode(metadata_bytes)
            got_metadata_time = time.perf_counter()

            # Register Remote agent.
            remote_agent_name = self.add_remote_agent(
                metadata, rank, remote_tp_size)
            setup_agent_time = time.perf_counter()

            logger.debug("NIXL handshake: get metadata took: %s",
                         got_metadata_time - start_time)
            logger.debug("NIXL handshake: add agent took: %s",
                         setup_agent_time - got_metadata_time)
            return remote_agent_name

    # Handshake only with the remote TP rank that current local rank will
    # pull from. With homogeneous TP it happens to be the same rank_i.
    tp_ratio = self._tp_size[self.engine_id] // remote_tp_size
    p_remote_rank = self.tp_rank // tp_ratio
    path = make_zmq_path("tcp", host, port + p_remote_rank)
    logger.debug("Querying metadata on path: %s at remote rank %s", path,
                 p_remote_rank)
    # Remote rank -> agent name.
    return {p_remote_rank: handshake(path, p_remote_rank)}
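
A worked example of which remote rank (and therefore which side-channel port) each local worker queries, assuming a decode TP of 4 pulling from a prefill TP of 2; the base port is a placeholder:

local_tp_size, remote_tp_size = 4, 2
tp_ratio = local_tp_size // remote_tp_size   # 2 decode workers per prefill worker
base_port = 6000                             # hypothetical remote base port

# Each local rank handshakes with the remote rank it will pull from,
# on that rank's listener port (base_port + p_remote_rank).
assignments = {tp_rank: tp_rank // tp_ratio for tp_rank in range(local_tp_size)}
ports = {tp_rank: base_port + remote for tp_rank, remote in assignments.items()}

assert assignments == {0: 0, 1: 0, 2: 1, 3: 1}
assert ports == {0: 6000, 1: 6000, 2: 6001, 3: 6001}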

_nixl_handshake_listener staticmethod

_nixl_handshake_listener(
    metadata: NixlAgentMetadata,
    ready_event: Event,
    base_port: int,
    tp_rank: int,
)

Background thread for getting new NIXL handshakes.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@staticmethod
def _nixl_handshake_listener(metadata: NixlAgentMetadata,
                             ready_event: threading.Event, base_port: int,
                             tp_rank: int):
    """Background thread for getting new NIXL handshakes."""
    # NOTE(rob): this is a simple implementation. We will move
    # to a better approach via HTTP endpoint soon.

    encoder = msgspec.msgpack.Encoder()
    encoded_data = encoder.encode(metadata)
    size_in_bytes = len(encoded_data)
    logger.debug("Size of encoded NixlAgentMetadata: %s bytes",
                 str(size_in_bytes))

    # Listen for new requests for metadata.
    host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
    path = make_zmq_path("tcp", host, base_port + tp_rank)
    logger.debug("Starting listening on path: %s", path)
    with zmq_ctx(zmq.ROUTER, path) as sock:
        ready_event.set()
        while True:
            identity, _, msg = sock.recv_multipart()
            if msg != GET_META_MSG:
                logger.warning(
                    "Connection listener got unexpected message %s", msg)
            sock.send_multipart((identity, b"", encoded_data))
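
The metadata served by this listener travels as msgspec-encoded msgpack. A minimal encode/decode round trip of NixlAgentMetadata, assuming the class is imported from this module; every field value below is a placeholder:

import msgspec

meta = NixlAgentMetadata(
    engine_id="engine-0",
    agent_metadata=b"\x00\x01",          # opaque bytes from get_agent_metadata()
    kv_caches_base_addr=[0x7F0000000000],
    num_blocks=1024,
    block_len=262144,
    attn_backend_name="FLASH_ATTN_VLLM_V1",
)

encoded = msgspec.msgpack.Encoder().encode(meta)                       # listener side
decoded = msgspec.msgpack.Decoder(NixlAgentMetadata).decode(encoded)  # requester side
assert decoded.num_blocks == meta.num_blocks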

_pop_done_transfers

_pop_done_transfers(
    transfers: dict[str, list[tuple[int, float]]],
) -> set[str]

Pop completed xfers by checking for DONE state.

Args:
    transfers: dict of req_id -> list[running_xfer]

Returns:
    set of req_ids that have all done xfers

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def _pop_done_transfers(
        self, transfers: dict[str, list[tuple[int, float]]]) -> set[str]:
    """
    Pop completed xfers by checking for DONE state.
    Args:
        transfers: dict of req_id -> list[running_xfer]
    Returns:
        set of req_ids that have all done xfers
    """
    done_req_ids: set[str] = set()
    for req_id, handles in list(transfers.items()):
        in_progress = False
        for handle, _xfer_stime in handles:
            xfer_state = self.nixl_wrapper.check_xfer_state(handle)
            if xfer_state == "DONE":
                self.nixl_wrapper.release_xfer_handle(handle)
            elif xfer_state == "PROC":
                in_progress = True
                continue
            else:
                raise RuntimeError(
                    f"Transfer failed with state {xfer_state}")
        if not in_progress:
            done_req_ids.add(req_id)
            del transfers[req_id]
    return done_req_ids

_read_blocks

_read_blocks(
    local_block_ids: list[int],
    remote_block_ids: list[int],
    dst_engine_id: str,
    request_id: str,
)
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def _read_blocks(self, local_block_ids: list[int],
                 remote_block_ids: list[int], dst_engine_id: str,
                 request_id: str):
    # NOTE(rob): having the staging blocks be on the READER side is
    # not going to work well, since we will have to rearrange tensors
    # after we detect the txn is complete (which means we cannot make the
    # read txn async easily). If we want to make "READ" happen cleanly,
    # then we will need to have the staging blocks on the remote side.

    # NOTE(rob): according to nvidia the staging blocks are used to
    # saturate IB with heterogeneous TP sizes. We should remove the staging
    # blocks until we are ready.

    # Number of D TP workers that will read from dst P. Propagate tp_ratio
    # on notification so that dst worker can wait before freeing blocks.
    tp_ratio = self._tp_size[
        self.engine_id] // self._tp_size[dst_engine_id]
    notif_id = f"{request_id}:{tp_ratio}".encode()

    # Full prefix cache hit: do not need to read remote blocks,
    # just notify P worker that we have the blocks we need.
    num_local_blocks = len(local_block_ids)
    if num_local_blocks == 0:
        remote_rank = self.tp_rank // tp_ratio
        agent_name = self._remote_agents[dst_engine_id][remote_rank]
        self.nixl_wrapper.send_notif(agent_name, notif_msg=notif_id)
        return

    # Partial prefix cache hit: just read uncomputed blocks.
    num_remote_blocks = len(remote_block_ids)
    assert num_local_blocks <= num_remote_blocks
    if num_local_blocks < num_remote_blocks:
        remote_block_ids = remote_block_ids[-num_local_blocks:]

    # Get side handles.
    local_xfer_side_handle = self.src_xfer_side_handle
    remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id]

    # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from
    # corresponding rank. With heterogeneous TP, fixing D>P, the D tp
    # workers will issue xfers to parts of the P worker remote kv caches.

    # Get descs ids.
    local_block_descs_ids: list[int] = []
    remote_block_descs_ids: list[int] = []
    if not self.block_window_per_layer:
        # Default case: assume global attention
        remote_block_descs_ids = self._get_block_descs_ids(
            dst_engine_id, remote_block_ids)
        local_block_descs_ids = self._get_block_descs_ids(
            self.engine_id, local_block_ids)
    else:
        # TODO(mgoin): remove this once we have hybrid memory allocator
        # Optimization for models with local attention (Llama 4)
        for layer_idx, block_window in enumerate(
                self.block_window_per_layer):
            # For each layer:
            if block_window is None:
                # If not chunked, we just use the
                # full block lists (global attention)
                layer_local_block_ids = local_block_ids
                layer_remote_block_ids = remote_block_ids
            else:
                # If chunked, get the last block_window blocks
                layer_local_block_ids = local_block_ids[-block_window:]
                layer_remote_block_ids = remote_block_ids[-block_window:]

            # Get descs ids for the layer.
            layer_local_desc_ids = self._get_block_descs_ids(
                self.engine_id, layer_local_block_ids, layer_idx)
            layer_remote_desc_ids = self._get_block_descs_ids(
                dst_engine_id, layer_remote_block_ids, layer_idx)

            local_block_descs_ids.extend(layer_local_desc_ids)
            remote_block_descs_ids.extend(layer_remote_desc_ids)

    assert len(local_block_descs_ids) == len(remote_block_descs_ids)

    # Prepare transfer with Nixl.
    handle = self.nixl_wrapper.make_prepped_xfer(
        "READ",
        local_xfer_side_handle,
        local_block_descs_ids,
        remote_xfer_side_handle,
        remote_block_descs_ids,
        notif_msg=notif_id,
    )

    # Begin async xfer.
    self.nixl_wrapper.transfer(handle)

    # Use handle to check completion in future step().
    # TODO (NickLucche) surface xfer elapsed time
    self._recving_transfers[request_id].append(
        (handle, time.perf_counter()))
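
A small worked example of the partial prefix-cache-hit slicing above: local_block_ids holds only the blocks still missing locally, so only the matching tail of the remote blocks is pulled (ids are made up):

# 5 blocks exist on the remote prefill worker; the first 3 were already
# covered by the local prefix cache, so only 2 local blocks need KVs.
remote_block_ids = [10, 11, 12, 13, 14]
local_block_ids = [207, 208]            # destination blocks for the missing tail

num_local_blocks = len(local_block_ids)
if num_local_blocks < len(remote_block_ids):
    remote_block_ids = remote_block_ids[-num_local_blocks:]

assert remote_block_ids == [13, 14]     # only the uncomputed tail is read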

_read_blocks_for_req

_read_blocks_for_req(req_id: str, meta: ReqMeta)
Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
    logger.debug(
        "Remote agent %s available, calling _read_blocks for req %s",
        meta.remote_engine_id, req_id)
    self._read_blocks(
        request_id=req_id,
        dst_engine_id=meta.remote_engine_id,
        local_block_ids=meta.local_block_ids,
        remote_block_ids=meta.remote_block_ids,
    )

add_remote_agent

add_remote_agent(
    nixl_agent_meta: NixlAgentMetadata,
    remote_tp_rank: int = 0,
    remote_tp_size: int = 1,
) -> str

Add the remote NIXL agent and prepare the descriptors for reading cache blocks from remote.

In particular, handle both homogeneous and heterogeneous TP. The former requires local rank_i to read from remote rank_i. The latter, assuming D.world_size > P.world_size, requires that two or more local TP workers share the xfer from a single remote TP worker.

Here's an example:

rank_offset     p_remote_tp_rank
(kv split no)
--------------------------------
0                 0      Worker0  ---- 1st half of KV ----> Worker0  [ KV Cache ]
                                                            /
1                 0      Worker1  ---- 2nd half of KV -----/

0                 1      Worker2  ---- 1st half of KV ----> Worker1  [ KV Cache ]
                                                            /
1                 1      Worker3  ---- 2nd half of KV -----/


                    Decoder TP workers                     Prefix TP workers
                      (world_size=4)                         (world_size=2)
                                     tp_ratio = 4 // 2 = 2

Considering the KV Caches, if P-Worker_i has cache size [2, num_blocksP, kv_heads, block_size, head_dim]
then D-Worker_j has [2, num_blocksD, kv_heads//tp_ratio, block_size, head_dim]. Mind the "HND" layout format. Assuming num_blocksD >= num_blocksP, D-Worker0 reads from P-Worker0 by preparing the kv_heads//tp_ratio first heads from all the slots of all the blocks. D-Worker1 will do the same, but reading the second split along the kv_heads dimension, and so forth until "tp_ratio" D TP workers have pulled from P-Worker0.

Note that the above will also hold true for the homogeneous TP case, where tp_ratio evaluates to 1.

In the MLA case, the cache is replicated across TP workers, so the rank_offset will always be 0 and the whole cache is shared by "tp_ratio" D TP workers.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def add_remote_agent(self,
                     nixl_agent_meta: NixlAgentMetadata,
                     remote_tp_rank: int = 0,
                     remote_tp_size: int = 1) -> str:
    """
    Add the remote NIXL agent and prepare the descriptors for reading cache
    blocks from remote.

    In particular, handle both homogeneous and heterogeneous TP. The former
    requires local rank_i to read from remote rank_i. 
    The latter, assuming D.world_size > P.world_size, requires that two or 
    more local TP workers share the xfer from a single remote TP worker.

    Here's an example:

    rank_offset     p_remote_tp_rank
    (kv split no)    
    --------------------------------
        0                 0      Worker0  ---- 1st half of KV ----> Worker0  [ KV Cache ]
                                                                    /
        1                 0      Worker1  ---- 2nd half of KV -----/

        0                 1      Worker2  ---- 1st half of KV ----> Worker1  [ KV Cache ]
                                                                    /
        1                 1      Worker3  ---- 2nd half of KV -----/


                            Decoder TP workers                     Prefix TP workers
                              (world_size=4)                         (world_size=2)
                                             tp_ratio = 4 // 2 = 2                  

    Considering the KV Caches, if P-Worker_i has cache size [2, num_blocksP, kv_heads, block_size, head_dim]  
    then D-Worker_j has [2, num_blocksD, kv_heads//tp_ratio, block_size, head_dim]. Mind the "HND" layout format.
    Assuming num_blocksD >= num_blocksP, D-Worker0 reads from P-Worker0 by preparing the kv_heads//tp_ratio 
    first heads from all the slots of all the blocks. D-Worker1 will do the same, but reading the second split
    along the kv_heads dimension, and so forth until "tp_ratio" D TP workers have pulled from P-Worker0.   

    Note that the above will also hold true for the homogeneous TP case, where tp_ratio evaluates to 1.

    In the MLA case, the cache is replicated across TP workers, so the
    rank_offset will always be 0 and the whole cache is shared by
    "tp_ratio" D TP workers.
    """ # noqa: E501
    engine_id = nixl_agent_meta.engine_id
    # TODO re-evaluate refreshing for scaling/recovery
    if remote_tp_rank in self._remote_agents.get(engine_id, {}):
        return self._remote_agents[engine_id][remote_tp_rank]

    if engine_id in self._tp_size:
        assert self._tp_size[engine_id] == remote_tp_size
    else:
        self._tp_size[engine_id] = remote_tp_size
    # We may eventually enable this after asserting equality in cache
    # layout and close outputs.
    assert nixl_agent_meta.attn_backend_name == self.backend_name

    remote_agent_name = self.nixl_wrapper.add_remote_agent(
        nixl_agent_meta.agent_metadata)

    # Number of D TP workers reading from a single P TP worker. This is
    # 1 when P and D `--tensor-parallel-size` match.
    tp_ratio = divide(self._tp_size[self.engine_id],
                      self._tp_size[engine_id])
    assert tp_ratio > 0, "Decode TP cannot be smaller than prefill TP"

    # Handle tp_size>num_kv_heads: replicate KV cache.
    total_num_kv_heads = self.model_config.get_total_num_kv_heads()
    is_kv_replicated = self._tp_size[engine_id] // total_num_kv_heads >= 1

    if self.use_mla or is_kv_replicated:
        # With MLA the only difference is in the number of blocks.
        remote_block_size = nixl_agent_meta.block_len // (
            self.slot_size_bytes)
        assert self.block_len == nixl_agent_meta.block_len
    else:
        remote_block_size = nixl_agent_meta.block_len // (
            self.slot_size_bytes * tp_ratio)
        if self._use_flashinfer:
            # Account for joint KV in FlashInfer.
            remote_block_size //= 2

        assert nixl_agent_meta.block_len == self.block_len * tp_ratio, (
            "Remote P worker KV layer cache must be of shape [2, N, "
            "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype."
        )

    assert self.block_size == remote_block_size, (
        "Remote P worker with different block size is not supported "
        f"{self.block_size=} {remote_block_size=}")

    # Create dst descs and xfer side handles. TP workers have same #blocks.
    if engine_id in self.dst_num_blocks:
        assert self.dst_num_blocks[engine_id] == nixl_agent_meta.num_blocks
    else:
        self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks

    blocks_data = []
    # With homogeneous TP, D pulls the whole kv cache from corresponding
    # rank. With heterogeneous TP, prepare the descriptors by splitting the
    # P KV cache along kv_head dim, of D worker's kv_head size (D>P).
    # Eg. PTP1 DTP2 => P0 KV:[block0-KV_0 | block0-KV_1..].
    # Only register the remote's descriptors if current rank pulls from it.
    self.kv_caches_base_addr[
        engine_id] = nixl_agent_meta.kv_caches_base_addr
    rank_offset = self.tp_rank % tp_ratio * self.block_len \
        if not (self.use_mla or is_kv_replicated) else 0
    # Register all remote blocks, but only the corresponding kv heads.
    for base_addr in nixl_agent_meta.kv_caches_base_addr:
        for block_id in range(nixl_agent_meta.num_blocks):
            block_offset = block_id * nixl_agent_meta.block_len
            # For each block, grab the heads chunk belonging to rank_i
            # of size remote_nheads // tp_ratio, which correspond to
            # self.block_len == remote_block_len//tp_ratio bytes.
            addr = base_addr + block_offset + rank_offset
            # (addr, len, device id)
            blocks_data.append((addr, self.block_len, remote_tp_rank))
    logger.debug(
        "Created %s blocks for dst engine %s with remote rank %s and "
        "local rank %s", len(blocks_data), engine_id, remote_tp_rank,
        self.tp_rank)

    # Register with NIXL.
    descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
    self.dst_xfer_side_handles[
        engine_id] = self.nixl_wrapper.prep_xfer_dlist(
            remote_agent_name, descs)

    return remote_agent_name
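
A worked example of the rank_offset arithmetic from the docstring above for the non-MLA case (decode TP=4, prefill TP=2); the remote block length is an assumed value:

local_tp_size, remote_tp_size = 4, 2
tp_ratio = local_tp_size // remote_tp_size        # 2

remote_block_len = 64 * 1024                      # hypothetical bytes per remote block
local_block_len = remote_block_len // tp_ratio    # this worker owns half of the kv heads

# Each decode rank reads its own slice of every remote block.
offsets = {tp_rank: tp_rank % tp_ratio * local_block_len
           for tp_rank in range(local_tp_size)}

# Ranks 0 and 2 read the first half of every remote block (offset 0);
# ranks 1 and 3 read the second half (offset 32768).
assert offsets == {0: 0, 1: 32768, 2: 0, 3: 32768}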

get_finished

get_finished() -> tuple[set[str], set[str]]

Get requests that are done sending or recving.

In TP>1 setup, each rank exchanges KVs with its counterpart ranks independently. get_finished() runs in each worker and creates the done_sending and done_recving sets that are sent to the scheduler via ModelRunnerOutput by Rank 0. To ensure transfers are done before adding to finished, Ranks 1 to N-1 communicate to Rank 0 once their transaction is done, and Rank 0 returns finished sets to the Scheduler only once all ranks are done.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def get_finished(self) -> tuple[set[str], set[str]]:
    """
    Get requests that are done sending or recving.

    In TP>1 setup, each rank exchanges KVs with its counterpart
    ranks independently. get_finished() runs in each worker and
    creates the done_sending and done_recving sets that are sent
    to the scheduler via ModelRunnerOutput by Rank 0. To ensure
    transfers are done before adding to finished, Ranks 1 to N-1
    communicate to Rank 0 once their transaction is done, and
    Rank 0 returns finished sets to the Scheduler only once all
    ranks are done.
    """
    done_sending = self._get_new_notifs()
    done_recving = self._pop_done_transfers(self._recving_transfers)
    if len(done_sending) > 0 or len(done_recving) > 0:
        logger.debug(
            "Rank %s, get_finished: %s requests done sending "
            "and %s requests done recving", self.tp_rank,
            len(done_sending), len(done_recving))

    if self.world_size == 1:
        return done_sending, done_recving

    # Rank 0: get finished from all other ranks.
    if self.tp_rank == 0:
        for req_id in done_sending:
            self._done_sending_count[req_id] += 1
        for req_id in done_recving:
            self._done_recving_count[req_id] += 1

        # Keep track of how many other ranks have finished.
        other_ranks_finished_ids: list[str] = []
        for i in range(1, self.world_size):
            other_ranks_finished_ids.extend(
                self.tp_group.recv_object(src=i))
        for req_id in other_ranks_finished_ids:
            if (req_id in self._done_recving_count
                    or req_id in self._recving_transfers):
                self._done_recving_count[req_id] += 1
            else:
                self._done_sending_count[req_id] += 1

        # Return ids that finished on all ranks to the scheduler.
        all_done_recving: set[str] = set()
        for req_id in list(self._done_recving_count.keys()):
            if self._done_recving_count[req_id] == self.world_size:
                del self._done_recving_count[req_id]
                all_done_recving.add(req_id)

        all_done_sending: set[str] = set()
        for req_id in list(self._done_sending_count.keys()):
            if self._done_sending_count[req_id] == self.world_size:
                del self._done_sending_count[req_id]
                all_done_sending.add(req_id)

        return all_done_sending, all_done_recving

    # Ranks 1 to N-1: send finished ids to Rank 0.
    else:
        finished_req_ids = list(done_recving.union(done_sending))
        self.tp_group.send_object(finished_req_ids, dst=0)

        # Unused as only Rank 0 results are sent to scheduler.
        return done_sending, done_recving
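
A minimal sketch of the rank-0 counting logic described in the docstring: a request id is reported to the scheduler only once every TP rank has counted it as finished (the request id and world size are made up):

from collections import defaultdict

world_size = 2
done_recving_count = defaultdict(int)

# Step 1: only rank 0 has finished receiving "req-1" locally.
done_recving_count["req-1"] += 1
ready = {r for r, c in done_recving_count.items() if c == world_size}
assert ready == set()                    # not reported yet

# Step 2: rank 1 reports "req-1" via tp_group.send_object / recv_object.
done_recving_count["req-1"] += 1
ready = {r for r, c in done_recving_count.items() if c == world_size}
assert ready == {"req-1"}                # now returned to the scheduler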

register_kv_caches

register_kv_caches(kv_caches: dict[str, Tensor])

Register the KV Cache data in nixl.

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
    """Register the KV Cache data in nixl."""

    _, first_kv_cache = next(iter(kv_caches.items()))
    kv_elem_size = first_kv_cache.element_size()

    # TODO(tms): Find a more robust way to detect and handle MLA
    # NOTE (NickLucche) To move blocks efficiently with NIXL, the expected
    # KV memory layout is HND, as opposed to the default NHD. Note that it
    # will only affect the strides. For MLA instead, we make no such
    # requirement and resort to the standard layout.
    use_mla = len(first_kv_cache.shape) == 3
    assert use_mla == self.use_mla

    # TODO (NickLucche) not compatible with hybrid allocator. Enforce check
    # once it goes live, as a single kv layout is expected for xfers.
    if use_mla:
        # MLA case.
        self.num_blocks = first_kv_cache.shape[0]
        block_rank = 2  # [block_size, latent_dim]
        block_shape = first_kv_cache.shape[-block_rank:]
        block_size, kv_latent_dim = block_shape
        self.slot_size_bytes = kv_elem_size * kv_latent_dim
    else:
        # [2 (k and v), num_blocks, ...]
        if self._use_flashinfer:
            # FlashInfer swaps 2<->num_blocks dimensions.
            self.num_blocks = first_kv_cache.shape[0]
            block_rank = 4  # [2, block_size, kv_heads, head_dim]
        else:
            self.num_blocks = first_kv_cache.shape[1]
            block_rank = 3  # [block_size, kv_heads, head_dim]
        block_shape = first_kv_cache.shape[-block_rank:]
        block_size, n_kv_heads, head_dim = block_shape[-3:]
        # head size in bytes.
        self.slot_size_bytes = kv_elem_size * n_kv_heads * head_dim
    assert block_size == self.block_size
    # TODO(tms): self.block_len needs to be per-layer for sliding window,
    # hybrid attn, etc
    # block size in bytes
    self.block_len = kv_elem_size * math.prod(block_shape)
    logger.info(
        "Registering KV_Caches: use_mla: %s, num_blocks: %s, "
        "block_shape: %s, per_layer_kv_cache_shape: %s", use_mla,
        self.num_blocks, block_shape, first_kv_cache.shape)
    self.dst_num_blocks[self.engine_id] = self.num_blocks
    self.kv_caches = kv_caches
    kv_caches_base_addr = []
    caches_data = []

    # Note(tms): I modified this from the original region setup code.
    # K and V are now in different regions. Advantage is that we can
    # elegantly support MLA and any cases where the K and V tensors
    # are non-contiguous (it's not locally guaranteed that they will be).
    # Disadvantage is that the encoded NixlAgentMetadata is now larger
    # (roughly 8KB vs 5KB).
    # Conversely for FlashInfer, K and V are transferred in the same tensor
    # to better exploit the memory layout (ie num_blocks is the first dim).
    for cache_or_caches in kv_caches.values():
        # Normalize to always be a list of caches
        cache_list = [cache_or_caches] if use_mla or self._use_flashinfer \
            else cache_or_caches
        for cache in cache_list:
            base_addr = cache.data_ptr()
            region_len = self.num_blocks * self.block_len
            caches_data.append(
                (base_addr, region_len, cache.device.index, ""))
            kv_caches_base_addr.append(base_addr)
    self.kv_caches_base_addr[self.engine_id] = kv_caches_base_addr
    self.num_regions = len(caches_data)
    self.num_layers = len(self.kv_caches.keys())

    # TODO(mgoin): remove this once we have hybrid memory allocator
    # Optimization for models with local attention (Llama 4)
    if self.vllm_config.model_config.hf_config.model_type == "llama4":
        from transformers import Llama4TextConfig
        assert isinstance(self.vllm_config.model_config.hf_text_config,
                          Llama4TextConfig)
        llama4_config = self.vllm_config.model_config.hf_text_config
        no_rope_layers = llama4_config.no_rope_layers
        chunk_size = llama4_config.attention_chunk_size
        chunk_block_size = math.ceil(chunk_size / self.block_size)
        for layer_idx in range(self.num_layers):
            # no_rope_layers[layer_idx] == 0 means NoPE (global)
            # Any other value means RoPE (local chunked)
            is_local_attention = no_rope_layers[layer_idx] != 0
            block_window = chunk_block_size if is_local_attention else None
            self.block_window_per_layer.append(block_window)
        logger.debug("Llama 4 block window per layer mapping: %s",
                     self.block_window_per_layer)
        assert len(self.block_window_per_layer) == self.num_layers

    descs = self.nixl_wrapper.get_reg_descs(caches_data, "VRAM")
    logger.debug("Registering descs: %s", caches_data)
    self.nixl_wrapper.register_memory(descs)
    logger.debug("Done registering descs")
    self._registered_descs.append(descs)

    # Register local/src descr for NIXL xfer.
    blocks_data = []
    for base_addr in self.kv_caches_base_addr[self.engine_id]:
        # NOTE With heter-TP, more blocks are prepared than what are
        # needed as self.num_blocks >= nixl_agent_meta.num_blocks. We
        # could create fewer, but then _get_block_descs_ids needs to
        # select agent_meta.num_blocks instead of self.num_blocks for
        # local descr, and that makes handling regular flow less clean.
        for block_id in range(self.num_blocks):
            block_offset = block_id * self.block_len
            addr = base_addr + block_offset
            # (addr, len, device id)
            blocks_data.append((addr, self.block_len, self.tp_rank))
    logger.debug("Created %s blocks for src engine %s and rank %s",
                 len(blocks_data), self.engine_id, self.tp_rank)

    descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM")
    # NIXL_INIT_AGENT to be used for preparations of local descs.
    self.src_xfer_side_handle = self.nixl_wrapper.prep_xfer_dlist(
        "NIXL_INIT_AGENT", descs)

    # After KV Caches registered, listen for new connections.
    metadata = NixlAgentMetadata(
        engine_id=self.engine_id,
        agent_metadata=self.nixl_wrapper.get_agent_metadata(),
        kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id],
        num_blocks=self.num_blocks,
        block_len=self.block_len,
        attn_backend_name=self.backend_name)
    ready_event = threading.Event()
    self._nixl_handshake_listener_t = threading.Thread(
        target=self._nixl_handshake_listener,
        args=(metadata, ready_event, self.side_channel_port, self.tp_rank),
        daemon=True,
        name="nixl_handshake_listener")
    self._nixl_handshake_listener_t.start()
    ready_event.wait()  # Wait for listener ZMQ socket to be ready.
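
A worked example of the slot and block size arithmetic above for a non-MLA, non-FlashInfer layout; the cache dimensions are assumptions, not values from a real model:

import math

kv_elem_size = 2                          # fp16
block_size, n_kv_heads, head_dim = 16, 8, 128
block_shape = (block_size, n_kv_heads, head_dim)

slot_size_bytes = kv_elem_size * n_kv_heads * head_dim   # bytes per token slot
block_len = kv_elem_size * math.prod(block_shape)        # bytes per block (K or V alone)

assert slot_size_bytes == 2048
assert block_len == block_size * slot_size_bytes == 32768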

start_load_kv

start_load_kv(metadata: NixlConnectorMetadata)

Start loading by triggering non-blocking nixl_xfer. We check for these transfers to complete in each step().

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
def start_load_kv(self, metadata: NixlConnectorMetadata):
    """
    Start loading by triggering non-blocking nixl_xfer.
    We check for these transfers to complete in each step().
    """
    for req_id, meta in metadata.requests.items():
        remote_engine_id = meta.remote_engine_id
        logger.debug(
            "start_load_kv for request %s from remote engine %s. "
            "Num local_block_ids: %s. Num remote_block_ids: %s. ", req_id,
            remote_engine_id, len(meta.local_block_ids),
            len(meta.remote_block_ids))
        if remote_engine_id not in self._remote_agents:
            # Initiate handshake with remote engine to exchange metadata.
            with self._handshake_lock:
                if remote_engine_id not in self._remote_agents:
                    self._background_nixl_handshake(
                        req_id, remote_engine_id, meta)
                    continue

        # Handshake already completed, start async read xfer.
        self._read_blocks_for_req(req_id, meta)

    # Start transfers for requests whose handshakes have now finished.
    while not self._ready_requests.empty():
        self._read_blocks_for_req(*self._ready_requests.get_nowait())

ReqMeta dataclass

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@dataclass
class ReqMeta:
    local_block_ids: list[int]
    remote_block_ids: list[int]
    remote_host: str
    remote_port: int
    remote_engine_id: str
    tp_size: int

local_block_ids instance-attribute

local_block_ids: list[int]

remote_block_ids instance-attribute

remote_block_ids: list[int]

remote_engine_id instance-attribute

remote_engine_id: str

remote_host instance-attribute

remote_host: str

remote_port instance-attribute

remote_port: int

tp_size instance-attribute

tp_size: int

__init__

__init__(
    local_block_ids: list[int],
    remote_block_ids: list[int],
    remote_host: str,
    remote_port: int,
    remote_engine_id: str,
    tp_size: int,
) -> None

zmq_ctx

zmq_ctx(socket_type: Any, addr: str) -> Iterator[Socket]

Context manager for a ZMQ socket

Source code in vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@contextlib.contextmanager
def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]:
    """Context manager for a ZMQ socket"""

    if socket_type not in (zmq.ROUTER, zmq.REQ):
        raise ValueError(f"Unexpected socket type: {socket_type}")

    ctx: Optional[zmq.Context] = None
    try:
        ctx = zmq.Context()  # type: ignore[attr-defined]
        yield make_zmq_socket(ctx=ctx,
                              path=addr,
                              socket_type=socket_type,
                              bind=socket_type == zmq.ROUTER)
    finally:
        if ctx is not None:
            ctx.destroy(linger=0)
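
A usage sketch of zmq_ctx as the requesting side of the handshake, mirroring what _nixl_handshake does; the address is a placeholder, and zmq_ctx / GET_META_MSG are assumed to be imported from this module:

import zmq

path = "tcp://10.0.0.1:6001"               # hypothetical remote side-channel address

with zmq_ctx(zmq.REQ, path) as sock:       # REQ side; the listener binds a ROUTER
    sock.send(GET_META_MSG)                # module-level query message
    metadata_bytes = sock.recv()           # msgpack-encoded NixlAgentMetadata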