vllm.worker.cpu_worker

A CPU worker class.

logger module-attribute

logger = init_logger(__name__)

CPUCacheEngine

Manages the KV cache for the CPU backend.

This class is responsible for initializing and managing CPU KV caches. It also provides methods for performing KV cache operations, such as copying.
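
For orientation, the standalone sketch below mirrors the allocation pattern used by this class: one CPU tensor per attention layer, all with the same backend-defined KV cache shape. The sizes and the (2, ...) layout are made-up illustrations; the real shape comes from the attention backend's get_kv_cache_shape() and the real dtype from get_kv_cache_dtype().

import torch

# Standalone sketch with illustrative sizes; in CPUCacheEngine the shape is
# produced by attn_backend.get_kv_cache_shape(num_blocks, block_size,
# num_heads, head_size) and the dtype by get_kv_cache_dtype().
num_layers, num_blocks, block_size, num_heads, head_size = 4, 128, 16, 8, 64
kv_cache_shape = (2, num_blocks, block_size, num_heads, head_size)  # assumed layout
cpu_cache = [
    torch.empty(kv_cache_shape, dtype=torch.bfloat16, device="cpu")
    for _ in range(num_layers)
]
print(len(cpu_cache), cpu_cache[0].shape)  # 4 torch.Size([2, 128, 16, 8, 64])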

Source code in vllm/worker/cpu_worker.py
class CPUCacheEngine:
    """Manages the KV cache for CPU backend.

    This class is responsible for initializing and managing CPU KV
    caches. It also provides methods for performing KV cache operations, such
    as copying.
    """

    def __init__(self, cache_config: CacheConfig, model_config: ModelConfig,
                 parallel_config: ParallelConfig,
                 device_config: DeviceConfig) -> None:
        assert device_config.device_type == "cpu"
        self.cache_config = cache_config
        self.model_config = model_config
        self.parallel_config = parallel_config

        self.head_size = model_config.get_head_size()
        self.num_layers = model_config.get_num_layers(parallel_config)
        self.num_heads = model_config.get_num_kv_heads(parallel_config)

        self.block_size = cache_config.block_size
        # Note: In CacheConfig, num_gpu_blocks is actually num_cpu_blocks
        # for CPU backend, because we want to reuse KV cache management
        # in the scheduler.
        self.num_cpu_blocks = cache_config.num_gpu_blocks

        self.dtype = CPUCacheEngine.get_kv_cache_dtype(cache_config,
                                                       model_config)

        # Get attention backend.
        self.attn_backend = get_attn_backend(
            self.model_config.get_head_size(),
            self.model_config.dtype,
            cache_config.cache_dtype,
            self.block_size,
            self.model_config.is_attention_free,
            use_mla=self.model_config.use_mla,
        )

        # Initialize the cache.
        self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks)

    def _allocate_kv_cache(
        self,
        num_blocks: int,
    ) -> List[torch.Tensor]:
        """Allocates KV cache on CPU."""
        kv_cache_shape = self.attn_backend.get_kv_cache_shape(
            num_blocks, self.block_size, self.num_heads, self.head_size)
        kv_cache: List[torch.Tensor] = []
        for _ in range(self.num_layers):
            kv_cache.append(
                torch.empty(kv_cache_shape, dtype=self.dtype, device="cpu"))
        return kv_cache

    def swap_in(self, src_to_dst: torch.Tensor) -> None:
        raise NotImplementedError("Swap is not supported in CPUCacheEngine.")

    def swap_out(self, src_to_dst: torch.Tensor) -> None:
        raise NotImplementedError("Swap is not supported in CPUCacheEngine.")

    def copy(self, src_to_dsts: torch.Tensor) -> None:
        self.attn_backend.copy_blocks(self.cpu_cache, src_to_dsts)

    @staticmethod
    def get_kv_cache_dtype(cache_config: CacheConfig,
                           model_config: ModelConfig):
        if cache_config.cache_dtype == "auto":
            return model_config.dtype
        elif cache_config.cache_dtype in ["fp8", "fp8_e5m2"]:
            return torch.float8_e5m2
        else:
            raise NotImplementedError(f"Unsupported KV cache type "
                                      f"{cache_config.cache_dtype}.")

    @staticmethod
    def get_cache_block_size(
        cache_config: CacheConfig,
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
    ) -> int:
        head_size = model_config.get_head_size()
        num_heads = model_config.get_num_kv_heads(parallel_config)
        num_layers = model_config.get_num_layers(parallel_config)

        key_cache_block = cache_config.block_size * num_heads * head_size
        value_cache_block = key_cache_block if not model_config.use_mla else 0
        total = num_layers * (key_cache_block + value_cache_block)
        dtype = CPUCacheEngine.get_kv_cache_dtype(cache_config, model_config)
        dtype_size = torch.tensor([], dtype=dtype).element_size()
        return dtype_size * total

attn_backend instance-attribute

attn_backend = get_attn_backend(
    get_head_size(),
    dtype,
    cache_dtype,
    block_size,
    is_attention_free,
    use_mla=use_mla,
)

block_size instance-attribute

block_size = block_size

cache_config instance-attribute

cache_config = cache_config

cpu_cache instance-attribute

cpu_cache = _allocate_kv_cache(num_cpu_blocks)

dtype instance-attribute

dtype = get_kv_cache_dtype(cache_config, model_config)

head_size instance-attribute

head_size = get_head_size()

model_config instance-attribute

model_config = model_config

num_cpu_blocks instance-attribute

num_cpu_blocks = num_gpu_blocks

num_heads instance-attribute

num_heads = get_num_kv_heads(parallel_config)

num_layers instance-attribute

num_layers = get_num_layers(parallel_config)

parallel_config instance-attribute

parallel_config = parallel_config

__init__

__init__(
    cache_config: CacheConfig,
    model_config: ModelConfig,
    parallel_config: ParallelConfig,
    device_config: DeviceConfig,
) -> None
Source code in vllm/worker/cpu_worker.py
def __init__(self, cache_config: CacheConfig, model_config: ModelConfig,
             parallel_config: ParallelConfig,
             device_config: DeviceConfig) -> None:
    assert device_config.device_type == "cpu"
    self.cache_config = cache_config
    self.model_config = model_config
    self.parallel_config = parallel_config

    self.head_size = model_config.get_head_size()
    self.num_layers = model_config.get_num_layers(parallel_config)
    self.num_heads = model_config.get_num_kv_heads(parallel_config)

    self.block_size = cache_config.block_size
    # Note: In CacheConfig, num_gpu_blocks is actually num_cpu_blocks
    # for CPU backend, because we want to reuse KV cache management
    # in the scheduler.
    self.num_cpu_blocks = cache_config.num_gpu_blocks

    self.dtype = CPUCacheEngine.get_kv_cache_dtype(cache_config,
                                                   model_config)

    # Get attention backend.
    self.attn_backend = get_attn_backend(
        self.model_config.get_head_size(),
        self.model_config.dtype,
        cache_config.cache_dtype,
        self.block_size,
        self.model_config.is_attention_free,
        use_mla=self.model_config.use_mla,
    )

    # Initialize the cache.
    self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks)

_allocate_kv_cache

_allocate_kv_cache(num_blocks: int) -> List[Tensor]

Allocates KV cache on CPU.

Source code in vllm/worker/cpu_worker.py
def _allocate_kv_cache(
    self,
    num_blocks: int,
) -> List[torch.Tensor]:
    """Allocates KV cache on CPU."""
    kv_cache_shape = self.attn_backend.get_kv_cache_shape(
        num_blocks, self.block_size, self.num_heads, self.head_size)
    kv_cache: List[torch.Tensor] = []
    for _ in range(self.num_layers):
        kv_cache.append(
            torch.empty(kv_cache_shape, dtype=self.dtype, device="cpu"))
    return kv_cache

copy

copy(src_to_dsts: Tensor) -> None
Source code in vllm/worker/cpu_worker.py
def copy(self, src_to_dsts: torch.Tensor) -> None:
    self.attn_backend.copy_blocks(self.cpu_cache, src_to_dsts)

get_cache_block_size staticmethod

get_cache_block_size(
    cache_config: CacheConfig,
    model_config: ModelConfig,
    parallel_config: ParallelConfig,
) -> int
Source code in vllm/worker/cpu_worker.py
@staticmethod
def get_cache_block_size(
    cache_config: CacheConfig,
    model_config: ModelConfig,
    parallel_config: ParallelConfig,
) -> int:
    head_size = model_config.get_head_size()
    num_heads = model_config.get_num_kv_heads(parallel_config)
    num_layers = model_config.get_num_layers(parallel_config)

    key_cache_block = cache_config.block_size * num_heads * head_size
    value_cache_block = key_cache_block if not model_config.use_mla else 0
    total = num_layers * (key_cache_block + value_cache_block)
    dtype = CPUCacheEngine.get_kv_cache_dtype(cache_config, model_config)
    dtype_size = torch.tensor([], dtype=dtype).element_size()
    return dtype_size * total
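
For intuition, the snippet below redoes the same arithmetic with illustrative, made-up model parameters rather than calling the static method (which needs real config objects):

import torch

# Illustrative values only; the real ones come from ModelConfig/CacheConfig.
block_size, num_heads, head_size, num_layers = 16, 8, 128, 32
dtype_size = torch.tensor([], dtype=torch.bfloat16).element_size()  # 2 bytes

key_cache_block = block_size * num_heads * head_size   # 16384 elements
value_cache_block = key_cache_block                    # would be 0 with use_mla
total = num_layers * (key_cache_block + value_cache_block)
print(dtype_size * total)  # 2097152 bytes, i.e. 2 MiB per cache block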

get_kv_cache_dtype staticmethod

get_kv_cache_dtype(
    cache_config: CacheConfig, model_config: ModelConfig
)
Source code in vllm/worker/cpu_worker.py
@staticmethod
def get_kv_cache_dtype(cache_config: CacheConfig,
                       model_config: ModelConfig):
    if cache_config.cache_dtype == "auto":
        return model_config.dtype
    elif cache_config.cache_dtype in ["fp8", "fp8_e5m2"]:
        return torch.float8_e5m2
    else:
        raise NotImplementedError(f"Unsupported KV cache type "
                                  f"{cache_config.cache_dtype}.")

swap_in

swap_in(src_to_dst: Tensor) -> None
Source code in vllm/worker/cpu_worker.py
def swap_in(self, src_to_dst: torch.Tensor) -> None:
    raise NotImplementedError("Swap is not supported in CPUCacheEngine.")

swap_out

swap_out(src_to_dst: Tensor) -> None
Source code in vllm/worker/cpu_worker.py
def swap_out(self, src_to_dst: torch.Tensor) -> None:
    raise NotImplementedError("Swap is not supported in CPUCacheEngine.")

CPUWorker

Bases: LocalOrDistributedWorkerBase

A worker class that executes (a partition of) the model on a CPU socket.

Each worker is associated with a single CPU socket. The worker is responsible for maintaining the KV cache and executing the model on the CPU. In case of distributed inference, each worker is assigned a partition of the model.
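
As a rough sketch of how an executor typically drives this worker, the function below strings together the methods documented on this page. It is illustrative only: constructing the worker itself requires a full VllmConfig, so the worker is taken as an argument rather than built here.

def run_cpu_worker_lifecycle(worker) -> None:
    # Hypothetical helper, not part of vLLM: shows the documented call order.
    worker.init_device()            # bind OpenMP threads, init the gloo group
    worker.load_model()
    num_gpu_blocks, num_cpu_blocks = worker.determine_num_available_blocks()
    # For the CPU backend num_cpu_blocks is always 0; CPU blocks are reported
    # as "GPU" blocks so the existing scheduler can be reused unchanged.
    worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)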

Source code in vllm/worker/cpu_worker.py
class CPUWorker(LocalOrDistributedWorkerBase):
    """A worker class that executes (a partition of) the model on a CPU socket.

    Each worker is associated with a single CPU socket. The worker is 
    responsible for maintaining the KV cache and executing the model on the 
    CPU. In case of distributed inference, each worker is assigned a partition
    of the model.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
        local_rank: int,
        rank: int,
        distributed_init_method: str,
        kv_cache_dtype: Optional[str] = "auto",
        is_driver_worker: bool = False,
        model_runner_cls: Optional[Type[CPUModelRunner]] = None,
    ) -> None:
        WorkerBase.__init__(self, vllm_config=vllm_config)

        self.local_rank = local_rank
        self.rank = rank
        vllm_config.parallel_config.rank = rank

        self.distributed_init_method = distributed_init_method

        self.is_driver_worker = is_driver_worker
        if self.is_driver_worker:
            assert self.rank == 0, "The driver worker must have rank 0."

        if self.model_config.trust_remote_code:
            # note: lazy import to avoid importing torch before initializing
            from vllm.utils import init_cached_hf_modules
            init_cached_hf_modules()

        # Setup OpenMP threads affinity.
        omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
        self.local_omp_cpuid = "all"
        if omp_cpuids == "auto":
            self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes(
            )
        else:
            self.local_omp_cpuid = omp_cpuids.split("|")[rank]

        # Return hidden states from target model if the draft model is an
        # mlp_speculator
        speculative_config = self.speculative_config
        model_config = self.model_config
        speculative_args = {} if speculative_config is None \
            or (speculative_config.draft_model_config.model ==
                model_config.model) \
            or (speculative_config.draft_model_config.hf_config.model_type
                not in ["medusa", "mlp_speculator", "eagle"]) \
                    else {"return_hidden_states": True}
        ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner
        if self.model_config.runner_type == "pooling":
            ModelRunnerClass = CPUPoolingModelRunner
        elif self.model_config.is_encoder_decoder:
            ModelRunnerClass = CPUEncoderDecoderModelRunner
        self.model_runner: CPUModelRunnerBase = ModelRunnerClass(
            vllm_config=vllm_config,
            kv_cache_dtype=kv_cache_dtype,
            is_driver_worker=is_driver_worker,
            **speculative_args,
        )
        if model_runner_cls is not None:
            self.model_runner = model_runner_cls(self.model_runner)
        # Uninitialized cache engine. Will be initialized by
        # initialize_cache.
        self.cache_engine: List[CPUCacheEngine]
        # Initialize cpu_cache as pooling models don't initialize kv_caches
        self.cpu_cache: Optional[List[List[torch.Tensor]]] = None

        # Torch profiler. Enabled and configured through env vars:
        # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
        if envs.VLLM_TORCH_PROFILER_DIR:
            torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
            logger.info("Profiling enabled. Traces will be saved to: %s",
                        torch_profiler_trace_dir)
            self.profiler = torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                ],
                with_stack=True,
                on_trace_ready=torch.profiler.tensorboard_trace_handler(
                    torch_profiler_trace_dir, use_gzip=True))
        else:
            self.profiler = None

    def start_profile(self):
        if self.profiler is None:
            raise RuntimeError("Profiler is not enabled.")
        self.profiler.start()

    def stop_profile(self):
        if self.profiler is None:
            raise RuntimeError("Profiler is not enabled.")
        self.profiler.stop()

    def init_device(self) -> None:
        if self.local_omp_cpuid != "all":
            ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid)
            if ret:
                logger.info(ret)

        # Note: unique identifier for creating allreduce shared memory
        os.environ["VLLM_DIST_IDENT"] = self.distributed_init_method.split(
            ":")[-1]
        self.device = torch.device("cpu")
        self.init_distributed_environment()
        # Set random seed.
        set_random_seed(self.model_config.seed)

    def load_model(self):
        self.model_runner.load_model()

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of blocks available for the KV cache.

        This determines how many KV blocks can fit into the configured CPU
        KV cache space.

        Note that since vLLM assumes a block resides on GPU if it can be
        modified, we return num_gpu_blocks=num_cpu_blocks and num_cpu_blocks=0.
        This allows us to reuse the scheduler of vLLM without generalizing it
        to different devices.
        """
        # For CPU device, the block number will be calculated based on the
        # cpu_kvcache_space.
        cache_block_size = self.get_cache_block_size_bytes()
        num_cpu_blocks = int(self.cache_config.cpu_kvcache_space_bytes //
                             cache_block_size)
        num_cpu_blocks = max(num_cpu_blocks, 0)

        # Note: To reuse the cache management procedure,
        # use cpu cache as 'gpu cache'.
        num_gpu_blocks = num_cpu_blocks
        num_cpu_blocks = 0
        return num_gpu_blocks, num_cpu_blocks

    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        """Initialize the KV cache. Currently, swappable CPU memory is not
        supported.

        Since this worker does not support GPUs, we use the num_gpu_blocks to
        determine how many non-swappable CPU blocks to allocate.
        """
        assert (num_cpu_blocks == 0
                ), f"{type(self)} does not support swappable cache"

        # Note: To reuse the cache management procedure,
        # use cpu cache as 'gpu cache'.
        num_cpu_blocks = num_gpu_blocks

        self._validate_num_cpu_blocks(num_cpu_blocks)
        self.cache_config.num_gpu_blocks = num_cpu_blocks
        self.cache_config.num_cpu_blocks = 0

        # Initialize the cache.
        self._init_cache_engine()

    def add_lora(self, lora_request: LoRARequest) -> bool:
        return self.model_runner.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        return self.model_runner.remove_lora(lora_id)

    def pin_lora(self, lora_id: int) -> bool:
        return self.model_runner.pin_lora(lora_id)

    def list_loras(self) -> Set[int]:
        return self.model_runner.list_loras()

    def _validate_num_cpu_blocks(self, num_cpu_blocks: int) -> None:
        """Raise errors if the num_cpu_blocks is invalid.
        """
        if num_cpu_blocks <= 0:
            raise ValueError("No available memory for the cache blocks. "
                             "Try increasing `VLLM_CPU_KVCACHE_SPACE` when "
                             "initializing the engine.")

        max_seq_len = self.cache_config.block_size * num_cpu_blocks
        if self.model_config.max_model_len > max_seq_len:
            raise ValueError(
                f"The model's max seq len ({self.model_config.max_model_len}) "
                "is larger than the maximum number of tokens that can be "
                f"stored in KV cache ({max_seq_len}). Try increasing "
                "`VLLM_CPU_KVCACHE_SPACE` or decreasing `max_model_len` when "
                "initializing the engine.")

    def _init_cache_engine(self) -> None:
        self.cache_engine = [
            CPUCacheEngine(self.cache_config, self.model_config,
                           self.parallel_config, self.device_config)
            for _ in range(self.parallel_config.pipeline_parallel_size)
        ]
        self.cpu_cache = [
            self.cache_engine[ve].cpu_cache
            for ve in range(self.parallel_config.pipeline_parallel_size)
        ]
        bind_kv_cache(self.compilation_config.static_forward_context,
                      self.cpu_cache)
        self.model_runner.block_size = self.cache_engine[0].block_size

        assert all(
            self.cpu_cache[ve] is not None
            for ve in range(self.parallel_config.pipeline_parallel_size))

        # Populate the cache to warmup the memory
        for ve in range(self.parallel_config.pipeline_parallel_size):
            for layer_cache in self.cpu_cache[ve]:
                layer_cache.fill_(0)

    @property
    def do_metadata_broadcast(self) -> bool:
        return self.parallel_config.tensor_parallel_size > 1

    @property
    def kv_cache(self) -> Optional[List[List[torch.Tensor]]]:
        return self.cpu_cache

    @property
    def vocab_size(self) -> int:
        return self.model_runner.vocab_size

    @property
    def max_model_len(self) -> int:
        return self.model_config.max_model_len

    def execute_worker(
        self,
        worker_input: WorkerInput,
    ) -> None:
        if (worker_input.blocks_to_copy is not None
                and worker_input.blocks_to_copy.numel() > 0):
            self.cache_engine[worker_input.virtual_engine].copy(
                worker_input.blocks_to_copy)

    @torch.inference_mode()
    def prepare_worker_input(
            self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
        assert execute_model_req is not None
        virtual_engine: int = execute_model_req.virtual_engine
        num_seq_groups: int = len(execute_model_req.seq_group_metadata_list)
        blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
                                      device="cpu",
                                      dtype=torch.int64).view(-1, 2)
        assert len(execute_model_req.blocks_to_swap_in) == 0
        assert len(execute_model_req.blocks_to_swap_out) == 0
        return WorkerInput(
            num_seq_groups=num_seq_groups,
            blocks_to_copy=blocks_to_copy,
            virtual_engine=virtual_engine,
        )

    def init_distributed_environment(self) -> None:
        """Initialize the distributed environment."""

        parallel_config = self.parallel_config
        rank = self.rank
        distributed_init_method = self.distributed_init_method
        init_distributed_environment(
            world_size=parallel_config.world_size,
            rank=rank,
            distributed_init_method=distributed_init_method,
            backend="gloo",
        )

        # A small all_reduce for warmup.
        torch.distributed.all_reduce(torch.zeros(1).cpu())

        ensure_model_parallel_initialized(
            parallel_config.tensor_parallel_size,
            parallel_config.pipeline_parallel_size)

    def get_cache_block_size_bytes(self) -> int:
        """Return the size in bytes of a single KV cache block.
        """
        return CPUCacheEngine.get_cache_block_size(self.cache_config,
                                                   self.model_config,
                                                   self.parallel_config)

    def get_cpus_id_binding_based_on_numa_nodes(self) -> str:
        """Return CPUs id binding based on NUMA nodes.
        """
        rank_to_cpus = self.local_omp_cpuid
        # Setup OpenMP thread affinity based on NUMA nodes automatically
        world_size = self.vllm_config.parallel_config.world_size
        libnuma_found = util.find_spec("numa") is not None
        psutil_found = util.find_spec("psutil") is not None
        if libnuma_found and psutil_found:
            import psutil
            from numa import info
            cpu_count = psutil.cpu_count(logical=False)
            cpus_allow_list = psutil.Process().cpu_affinity()
            numa_size = info.get_num_configured_nodes()
            cpu_count_per_numa = cpu_count // numa_size
            num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU,
                                      cpu_count_per_numa // 2)

            # Build node_to_cpus from NUMA nodes that intersect the allowed CPU set.
            node_to_cpus = []
            for i in range(numa_size):
                node_intersect = set(
                    info.node_to_cpus(i)).intersection(cpus_allow_list)
                if bool(node_intersect):
                    node_to_cpus.append(list(node_intersect))

            if world_size > len(node_to_cpus):
                logger.error(
                    "Auto thread-binding failed because the world size (%d) "
                    "is larger than the number of allowed NUMA nodes (%d). "
                    "Please try to bind threads manually.", world_size,
                    len(node_to_cpus))
            else:
                end = cpu_count_per_numa - num_of_reserved_cpu
                rank_to_cpus_list = node_to_cpus[self.rank][:end]
                rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list)
                logger.info("auto thread-binding list: %s", rank_to_cpus)
        else:
            logger.warning(
                "Auto thread-binding is not supported because the numa and "
                "psutil packages are not installed; falling back to no "
                "thread-binding. To get better performance, please try to "
                "bind threads manually.")
        return rank_to_cpus

cache_engine instance-attribute

cache_engine: List[CPUCacheEngine]

cpu_cache instance-attribute

cpu_cache: Optional[List[List[Tensor]]] = None

distributed_init_method instance-attribute

distributed_init_method = distributed_init_method

do_metadata_broadcast property

do_metadata_broadcast: bool

is_driver_worker instance-attribute

is_driver_worker = is_driver_worker

kv_cache property

kv_cache: Optional[List[List[Tensor]]]

local_omp_cpuid instance-attribute

local_omp_cpuid = 'all'

local_rank instance-attribute

local_rank = local_rank

max_model_len property

max_model_len: int

model_runner instance-attribute

model_runner: CPUModelRunnerBase = ModelRunnerClass(
    vllm_config=vllm_config,
    kv_cache_dtype=kv_cache_dtype,
    is_driver_worker=is_driver_worker,
    **speculative_args,
)

profiler instance-attribute

profiler = profile(
    activities=[CPU],
    with_stack=True,
    on_trace_ready=tensorboard_trace_handler(
        torch_profiler_trace_dir, use_gzip=True
    ),
)

rank instance-attribute

rank = rank

vocab_size property

vocab_size: int

__init__

__init__(
    vllm_config: VllmConfig,
    local_rank: int,
    rank: int,
    distributed_init_method: str,
    kv_cache_dtype: Optional[str] = "auto",
    is_driver_worker: bool = False,
    model_runner_cls: Optional[Type[CPUModelRunner]] = None,
) -> None
Source code in vllm/worker/cpu_worker.py
def __init__(
    self,
    vllm_config: VllmConfig,
    local_rank: int,
    rank: int,
    distributed_init_method: str,
    kv_cache_dtype: Optional[str] = "auto",
    is_driver_worker: bool = False,
    model_runner_cls: Optional[Type[CPUModelRunner]] = None,
) -> None:
    WorkerBase.__init__(self, vllm_config=vllm_config)

    self.local_rank = local_rank
    self.rank = rank
    vllm_config.parallel_config.rank = rank

    self.distributed_init_method = distributed_init_method

    self.is_driver_worker = is_driver_worker
    if self.is_driver_worker:
        assert self.rank == 0, "The driver worker must have rank 0."

    if self.model_config.trust_remote_code:
        # note: lazy import to avoid importing torch before initializing
        from vllm.utils import init_cached_hf_modules
        init_cached_hf_modules()

    # Setup OpenMP threads affinity.
    omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
    self.local_omp_cpuid = "all"
    if omp_cpuids == "auto":
        self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes(
        )
    else:
        self.local_omp_cpuid = omp_cpuids.split("|")[rank]

    # Return hidden states from target model if the draft model is an
    # mlp_speculator
    speculative_config = self.speculative_config
    model_config = self.model_config
    speculative_args = {} if speculative_config is None \
        or (speculative_config.draft_model_config.model ==
            model_config.model) \
        or (speculative_config.draft_model_config.hf_config.model_type
            not in ["medusa", "mlp_speculator", "eagle"]) \
                else {"return_hidden_states": True}
    ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner
    if self.model_config.runner_type == "pooling":
        ModelRunnerClass = CPUPoolingModelRunner
    elif self.model_config.is_encoder_decoder:
        ModelRunnerClass = CPUEncoderDecoderModelRunner
    self.model_runner: CPUModelRunnerBase = ModelRunnerClass(
        vllm_config=vllm_config,
        kv_cache_dtype=kv_cache_dtype,
        is_driver_worker=is_driver_worker,
        **speculative_args,
    )
    if model_runner_cls is not None:
        self.model_runner = model_runner_cls(self.model_runner)
    # Uninitialized cache engine. Will be initialized by
    # initialize_cache.
    self.cache_engine: List[CPUCacheEngine]
    # Initialize cpu_cache as pooling models don't initialize kv_caches
    self.cpu_cache: Optional[List[List[torch.Tensor]]] = None

    # Torch profiler. Enabled and configured through env vars:
    # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
    if envs.VLLM_TORCH_PROFILER_DIR:
        torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
        logger.info("Profiling enabled. Traces will be saved to: %s",
                    torch_profiler_trace_dir)
        self.profiler = torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
            ],
            with_stack=True,
            on_trace_ready=torch.profiler.tensorboard_trace_handler(
                torch_profiler_trace_dir, use_gzip=True))
    else:
        self.profiler = None

_init_cache_engine

_init_cache_engine() -> None
Source code in vllm/worker/cpu_worker.py
def _init_cache_engine(self) -> None:
    self.cache_engine = [
        CPUCacheEngine(self.cache_config, self.model_config,
                       self.parallel_config, self.device_config)
        for _ in range(self.parallel_config.pipeline_parallel_size)
    ]
    self.cpu_cache = [
        self.cache_engine[ve].cpu_cache
        for ve in range(self.parallel_config.pipeline_parallel_size)
    ]
    bind_kv_cache(self.compilation_config.static_forward_context,
                  self.cpu_cache)
    self.model_runner.block_size = self.cache_engine[0].block_size

    assert all(
        self.cpu_cache[ve] is not None
        for ve in range(self.parallel_config.pipeline_parallel_size))

    # Populate the cache to warmup the memory
    for ve in range(self.parallel_config.pipeline_parallel_size):
        for layer_cache in self.cpu_cache[ve]:
            layer_cache.fill_(0)
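
The resulting cache is indexed first by pipeline-parallel virtual engine and then by layer. A tiny standalone illustration of that nesting (made-up sizes, empty tensors standing in for real KV blocks):

import torch

# Illustrative nesting only: cpu_cache[virtual_engine][layer] is one tensor.
pipeline_parallel_size, num_layers = 2, 4
cpu_cache = [
    [torch.empty(0) for _ in range(num_layers)]
    for _ in range(pipeline_parallel_size)
]
print(len(cpu_cache), len(cpu_cache[0]))  # 2 virtual engines, 4 layers each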

_validate_num_cpu_blocks

_validate_num_cpu_blocks(num_cpu_blocks: int) -> None

Raise errors if the num_cpu_blocks is invalid.

Source code in vllm/worker/cpu_worker.py
def _validate_num_cpu_blocks(self, num_cpu_blocks: int) -> None:
    """Raise errors if the num_cpu_blocks is invalid.
    """
    if num_cpu_blocks <= 0:
        raise ValueError("No available memory for the cache blocks. "
                         "Try increasing `VLLM_CPU_KVCACHE_SPACE` when "
                         "initializing the engine.")

    max_seq_len = self.cache_config.block_size * num_cpu_blocks
    if self.model_config.max_model_len > max_seq_len:
        raise ValueError(
            f"The model's max seq len ({self.model_config.max_model_len}) "
            "is larger than the maximum number of tokens that can be "
            f"stored in KV cache ({max_seq_len}). Try increasing "
            "`VLLM_CPU_KVCACHE_SPACE` or decreasing `max_model_len` when "
            "initializing the engine.")

add_lora

add_lora(lora_request: LoRARequest) -> bool
Source code in vllm/worker/cpu_worker.py
def add_lora(self, lora_request: LoRARequest) -> bool:
    return self.model_runner.add_lora(lora_request)

determine_num_available_blocks

determine_num_available_blocks() -> Tuple[int, int]

Determine the number of blocks available for the KV cache.

This determines how many KV blocks can fit into the configured CPU KV cache space.

Note that since vLLM assumes a block resides on GPU if it can be modified, we return num_gpu_blocks=num_cpu_blocks and num_cpu_blocks=0. This allows us to reuse the scheduler of vLLM without generalizing it to different devices.

Source code in vllm/worker/cpu_worker.py
def determine_num_available_blocks(self) -> Tuple[int, int]:
    """Determine the number of blocks available for the KV cache.

    This determines how many KV blocks can fit into the configured CPU
    KV cache space.

    Note that since vLLM assumes a block resides on GPU if it can be
    modified, we return num_gpu_blocks=num_cpu_blocks and num_cpu_blocks=0.
    This allows us to reuse the scheduler of vLLM without generalizing it
    to different devices.
    """
    # For CPU device, the block number will be calculated based on the
    # cpu_kvcache_space.
    cache_block_size = self.get_cache_block_size_bytes()
    num_cpu_blocks = int(self.cache_config.cpu_kvcache_space_bytes //
                         cache_block_size)
    num_cpu_blocks = max(num_cpu_blocks, 0)

    # Note: To reuse the cache management procedure,
    # use cpu cache as 'gpu cache'.
    num_gpu_blocks = num_cpu_blocks
    num_cpu_blocks = 0
    return num_gpu_blocks, num_cpu_blocks
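
A worked example with illustrative numbers (cpu_kvcache_space_bytes comes from the VLLM_CPU_KVCACHE_SPACE setting; the block size here is a placeholder):

# Illustrative numbers only.
cpu_kvcache_space_bytes = 4 * 1024**3  # e.g. a 4 GiB CPU KV cache budget
cache_block_size = 2 * 1024**2         # bytes per block, as if from get_cache_block_size_bytes()
num_cpu_blocks = cpu_kvcache_space_bytes // cache_block_size  # 2048
# Reported to the scheduler as (num_gpu_blocks=2048, num_cpu_blocks=0).
print(num_cpu_blocks)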

execute_worker

execute_worker(worker_input: WorkerInput) -> None
Source code in vllm/worker/cpu_worker.py
def execute_worker(
    self,
    worker_input: WorkerInput,
) -> None:
    if (worker_input.blocks_to_copy is not None
            and worker_input.blocks_to_copy.numel() > 0):
        self.cache_engine[worker_input.virtual_engine].copy(
            worker_input.blocks_to_copy)

get_cache_block_size_bytes

get_cache_block_size_bytes() -> int

Return the size in bytes of a single KV cache block.

Source code in vllm/worker/cpu_worker.py
def get_cache_block_size_bytes(self) -> int:
    """Return the size in bytes of a single KV cache block.
    """
    return CPUCacheEngine.get_cache_block_size(self.cache_config,
                                               self.model_config,
                                               self.parallel_config)

get_cpus_id_binding_based_on_numa_nodes

get_cpus_id_binding_based_on_numa_nodes() -> str

Return the CPU id binding based on NUMA nodes.

Source code in vllm/worker/cpu_worker.py
def get_cpus_id_binding_based_on_numa_nodes(self) -> str:
    """Return CPUs id binding based on NUMA nodes.
    """
    rank_to_cpus = self.local_omp_cpuid
    # Setup OpenMP thread affinity based on NUMA nodes automatically
    world_size = self.vllm_config.parallel_config.world_size
    libnuma_found = util.find_spec("numa") is not None
    psutil_found = util.find_spec("psutil") is not None
    if libnuma_found and psutil_found:
        import psutil
        from numa import info
        cpu_count = psutil.cpu_count(logical=False)
        cpus_allow_list = psutil.Process().cpu_affinity()
        numa_size = info.get_num_configured_nodes()
        cpu_count_per_numa = cpu_count // numa_size
        num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU,
                                  cpu_count_per_numa // 2)

        # Build node_to_cpus from NUMA nodes that intersect the allowed CPU set.
        node_to_cpus = []
        for i in range(numa_size):
            node_intersect = set(
                info.node_to_cpus(i)).intersection(cpus_allow_list)
            if bool(node_intersect):
                node_to_cpus.append(list(node_intersect))

        if world_size > len(node_to_cpus):
            logger.error(
                "Auto thread-binding failed because the world size (%d) "
                "is larger than the number of allowed NUMA nodes (%d). "
                "Please try to bind threads manually.", world_size,
                len(node_to_cpus))
        else:
            end = cpu_count_per_numa - num_of_reserved_cpu
            rank_to_cpus_list = node_to_cpus[self.rank][:end]
            rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list)
            logger.info("auto thread-binding list: %s", rank_to_cpus)
    else:
        logger.warning(
            "Auto thread-binding is not supported because the numa and "
            "psutil packages are not installed; falling back to no "
            "thread-binding. To get better performance, please try to "
            "bind threads manually.")
    return rank_to_cpus
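
When auto binding is unavailable (or not wanted), VLLM_CPU_OMP_THREADS_BIND can be set manually; each rank gets one "|"-separated entry, which is exactly what omp_cpuids.split("|")[rank] in __init__ consumes. A small illustration with made-up CPU ranges:

# Illustrative only: mirrors omp_cpuids.split("|")[rank] in __init__.
omp_cpuids = "0-15|16-31"  # hypothetical binding for a 2-rank setup
for rank in range(2):
    print(rank, omp_cpuids.split("|")[rank])
# 0 0-15
# 1 16-31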

init_device

init_device() -> None
Source code in vllm/worker/cpu_worker.py
def init_device(self) -> None:
    if self.local_omp_cpuid != "all":
        ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid)
        if ret:
            logger.info(ret)

    # Note: unique identifier for creating allreduce shared memory
    os.environ["VLLM_DIST_IDENT"] = self.distributed_init_method.split(
        ":")[-1]
    self.device = torch.device("cpu")
    self.init_distributed_environment()
    # Set random seed.
    set_random_seed(self.model_config.seed)
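
The shared-memory identifier is simply the last ":"-separated component of the init method. The address below is a hypothetical example:

# Hypothetical init method; only the trailing token becomes VLLM_DIST_IDENT,
# mirroring split(":")[-1] above.
distributed_init_method = "tcp://127.0.0.1:29500"
print(distributed_init_method.split(":")[-1])  # 29500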

init_distributed_environment

init_distributed_environment() -> None

Initialize the distributed environment.

Source code in vllm/worker/cpu_worker.py
def init_distributed_environment(self) -> None:
    """Initialize the distributed environment."""

    parallel_config = self.parallel_config
    rank = self.rank
    distributed_init_method = self.distributed_init_method
    init_distributed_environment(
        world_size=parallel_config.world_size,
        rank=rank,
        distributed_init_method=distributed_init_method,
        backend="gloo",
    )

    # A small all_reduce for warmup.
    torch.distributed.all_reduce(torch.zeros(1).cpu())

    ensure_model_parallel_initialized(
        parallel_config.tensor_parallel_size,
        parallel_config.pipeline_parallel_size)

initialize_cache

initialize_cache(
    num_gpu_blocks: int, num_cpu_blocks: int
) -> None

Initialize the KV cache. Currently, swappable CPU memory is not supported.

Since this worker does not support GPUs, we use the num_gpu_blocks to determine how many non-swappable CPU blocks to allocate.

Source code in vllm/worker/cpu_worker.py
def initialize_cache(self, num_gpu_blocks: int,
                     num_cpu_blocks: int) -> None:
    """Initialize the KV cache. Currently, swappable CPU memory is not
    supported.

    Since this worker does not support GPUs, we use the num_gpu_blocks to
    determine how many non-swappable CPU blocks to allocate.
    """
    assert (num_cpu_blocks == 0
            ), f"{type(self)} does not support swappable cache"

    # Note: To reuse the cache management procedure,
    # use cpu cache as 'gpu cache'.
    num_cpu_blocks = num_gpu_blocks

    self._validate_num_cpu_blocks(num_cpu_blocks)
    self.cache_config.num_gpu_blocks = num_cpu_blocks
    self.cache_config.num_cpu_blocks = 0

    # Initialize the cache.
    self._init_cache_engine()

list_loras

list_loras() -> Set[int]
Source code in vllm/worker/cpu_worker.py
def list_loras(self) -> Set[int]:
    return self.model_runner.list_loras()

load_model

load_model()
Source code in vllm/worker/cpu_worker.py
def load_model(self):
    self.model_runner.load_model()

pin_lora

pin_lora(lora_id: int) -> bool
Source code in vllm/worker/cpu_worker.py
def pin_lora(self, lora_id: int) -> bool:
    return self.model_runner.pin_lora(lora_id)

prepare_worker_input

prepare_worker_input(
    execute_model_req: ExecuteModelRequest,
) -> WorkerInput
Source code in vllm/worker/cpu_worker.py
@torch.inference_mode()
def prepare_worker_input(
        self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
    assert execute_model_req is not None
    virtual_engine: int = execute_model_req.virtual_engine
    num_seq_groups: int = len(execute_model_req.seq_group_metadata_list)
    blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
                                  device="cpu",
                                  dtype=torch.int64).view(-1, 2)
    assert len(execute_model_req.blocks_to_swap_in) == 0
    assert len(execute_model_req.blocks_to_swap_out) == 0
    return WorkerInput(
        num_seq_groups=num_seq_groups,
        blocks_to_copy=blocks_to_copy,
        virtual_engine=virtual_engine,
    )
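
The copy request is just (source block, destination block) pairs packed into an int64 CPU tensor of shape (N, 2); the block ids below are made up:

import torch

# Hypothetical copy request: copy block 0 -> 4 and block 1 -> 5.
pairs = [(0, 4), (1, 5)]
blocks_to_copy = torch.tensor(pairs, device="cpu", dtype=torch.int64).view(-1, 2)
print(blocks_to_copy.shape)  # torch.Size([2, 2])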

remove_lora

remove_lora(lora_id: int) -> bool
Source code in vllm/worker/cpu_worker.py
def remove_lora(self, lora_id: int) -> bool:
    return self.model_runner.remove_lora(lora_id)

start_profile

start_profile()
Source code in vllm/worker/cpu_worker.py
def start_profile(self):
    if self.profiler is None:
        raise RuntimeError("Profiler is not enabled.")
    self.profiler.start()

stop_profile

stop_profile()
Source code in vllm/worker/cpu_worker.py
def stop_profile(self):
    if self.profiler is None:
        raise RuntimeError("Profiler is not enabled.")
    self.profiler.stop()