vllm.v1.worker.cpu_model_runner

logger module-attribute

logger = init_logger(__name__)

CPUModelRunner

Bases: GPUModelRunner

Source code in vllm/v1/worker/cpu_model_runner.py
class CPUModelRunner(GPUModelRunner):

    def __init__(self, vllm_config: VllmConfig, device: torch.device):
        super().__init__(vllm_config, device)

        assert device == torch.device("cpu")
        assert self.speculative_config is None, "spec decode is not supported."

        self.use_cuda_graph = False
        self.cascade_attn_enabled = False

        self._postprocess_tenosrs()

    def _postprocess_tenosrs(self) -> None:
        # Note: replace device tensors with cpu tensors
        def replace_tensor(obj: Any, cpu_attr_name: str,
                           device_attr_name) -> None:
            cpu_tensor = getattr(obj, cpu_attr_name, None)
            device_tensor = getattr(obj, device_attr_name, None)
            if cpu_tensor is not None and device_tensor is not None:
                assert isinstance(cpu_tensor, torch.Tensor)
                assert isinstance(device_tensor, torch.Tensor)
                setattr(obj, device_attr_name, cpu_tensor)

        for k, v in vars(self).items():
            if k.endswith("_cpu") and isinstance(v, torch.Tensor):
                replace_tensor(self, k, k[:-4])

        for k, v in vars(self.input_batch).items():
            if k.endswith("_cpu_tensor") and isinstance(v, torch.Tensor):
                replace_tensor(self.input_batch, k, k[:-11])

        for k, v in vars(self.input_batch.block_table).items():
            if k.endswith("_cpu") and isinstance(v, torch.Tensor):
                replace_tensor(self.input_batch.block_table, k, k[:-4])

    def load_model(self) -> None:
        logger.info("Starting to load model %s...", self.model_config.model)
        self.model = get_model(vllm_config=self.vllm_config)

        if has_step_pooler(self.model):
            self.input_batch.logits_processing_needs_token_ids = True

        if self.lora_config:
            self.model = self.load_lora_model(self.model, self.model_config,
                                              self.scheduler_config,
                                              self.lora_config, self.device)

    def warming_up_model(self) -> None:
        logger.info("Warming up model for the compilation...")
        # Only generate graph for the generic shape
        with _set_global_compilation_settings(self.vllm_config):
            self._dummy_run(max(16, self.max_num_reqs))
        logger.info("Warming up done.")

    def _init_device_properties(self) -> None:
        pass

    def _sync_device(self) -> None:
        pass

cascade_attn_enabled instance-attribute

cascade_attn_enabled = False

use_cuda_graph instance-attribute

use_cuda_graph = False

__init__

__init__(vllm_config: VllmConfig, device: device)
Source code in vllm/v1/worker/cpu_model_runner.py
def __init__(self, vllm_config: VllmConfig, device: torch.device):
    super().__init__(vllm_config, device)

    assert device == torch.device("cpu")
    assert self.speculative_config is None, "spec decode is not supported."

    self.use_cuda_graph = False
    self.cascade_attn_enabled = False

    self._postprocess_tenosrs()
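
A minimal usage sketch (hypothetical; in practice the CPU worker constructs the runner, and `vllm_config` is assumed to be a fully populated VllmConfig):

import torch

runner = CPUModelRunner(vllm_config, torch.device("cpu"))

# Any non-CPU device trips the assertion in __init__:
# CPUModelRunner(vllm_config, torch.device("cuda:0"))  # AssertionError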

_init_device_properties

_init_device_properties() -> None
Source code in vllm/v1/worker/cpu_model_runner.py
def _init_device_properties(self) -> None:
    pass

_postprocess_tenosrs

_postprocess_tenosrs() -> None
Source code in vllm/v1/worker/cpu_model_runner.py
def _postprocess_tenosrs(self) -> None:
    # Note: replace device tensors with cpu tensors
    def replace_tensor(obj: Any, cpu_attr_name: str,
                       device_attr_name) -> None:
        cpu_tensor = getattr(obj, cpu_attr_name, None)
        device_tensor = getattr(obj, device_attr_name, None)
        if cpu_tensor is not None and device_tensor is not None:
            assert isinstance(cpu_tensor, torch.Tensor)
            assert isinstance(device_tensor, torch.Tensor)
            setattr(obj, device_attr_name, cpu_tensor)

    for k, v in vars(self).items():
        if k.endswith("_cpu") and isinstance(v, torch.Tensor):
            replace_tensor(self, k, k[:-4])

    for k, v in vars(self.input_batch).items():
        if k.endswith("_cpu_tensor") and isinstance(v, torch.Tensor):
            replace_tensor(self.input_batch, k, k[:-11])

    for k, v in vars(self.input_batch.block_table).items():
        if k.endswith("_cpu") and isinstance(v, torch.Tensor):
            replace_tensor(self.input_batch.block_table, k, k[:-4])
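
The replacement relies on a naming convention inherited from GPUModelRunner: paired attributes such as foo_cpu / foo (and foo_cpu_tensor / foo on input_batch), where the device-side name is simply rebound to its CPU twin so both names share the same storage and no host/device copies are needed. A self-contained sketch of the same pattern, using made-up attribute names:

import torch

class _Holder:
    def __init__(self) -> None:
        # Hypothetical attribute pair following the `<name>_cpu` / `<name>` convention.
        self.positions_cpu = torch.zeros(8, dtype=torch.int64)
        self.positions = torch.zeros(8, dtype=torch.int64)  # would live on the device on GPU

holder = _Holder()
for name, value in vars(holder).items():
    if name.endswith("_cpu") and isinstance(value, torch.Tensor):
        device_name = name[:-4]  # strip the "_cpu" suffix
        if isinstance(getattr(holder, device_name, None), torch.Tensor):
            setattr(holder, device_name, value)  # alias the "device" name to the CPU tensor

assert holder.positions is holder.positions_cpu  # both names now refer to the same CPU tensor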

_sync_device

_sync_device() -> None
Source code in vllm/v1/worker/cpu_model_runner.py
def _sync_device(self) -> None:
    pass

load_model

load_model() -> None
Source code in vllm/v1/worker/cpu_model_runner.py
def load_model(self) -> None:
    logger.info("Starting to load model %s...", self.model_config.model)
    self.model = get_model(vllm_config=self.vllm_config)

    if has_step_pooler(self.model):
        self.input_batch.logits_processing_needs_token_ids = True

    if self.lora_config:
        self.model = self.load_lora_model(self.model, self.model_config,
                                          self.scheduler_config,
                                          self.lora_config, self.device)

warming_up_model

warming_up_model() -> None
Source code in vllm/v1/worker/cpu_model_runner.py
def warming_up_model(self) -> None:
    logger.info("Warming up model for the compilation...")
    # Only generate graph for the generic shape
    with _set_global_compilation_settings(self.vllm_config):
        self._dummy_run(max(16, self.max_num_reqs))
    logger.info("Warming up done.")

_set_global_compilation_settings

_set_global_compilation_settings(config: VllmConfig)
Source code in vllm/v1/worker/cpu_model_runner.py
@contextmanager
def _set_global_compilation_settings(config: VllmConfig):
    import torch._inductor.config

    inductor_config = config.compilation_config.inductor_compile_config
    try:
        # Note: The MKLDNN and CPPGEMM backend requires freezing parameters.
        freezing_value = torch._inductor.config.freezing
        if inductor_config.get("max_autotune", False):
            torch._inductor.config.freezing = True
        yield
    finally:
        torch._inductor.config.freezing = freezing_value
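
A hypothetical illustration of the save/restore behaviour, assuming vllm_config has compilation_config.inductor_compile_config["max_autotune"] set to True:

import torch._inductor.config

before = torch._inductor.config.freezing

with _set_global_compilation_settings(vllm_config):
    # Inside the block, freezing is forced on so the MKLDNN/CPPGEMM backends
    # can fold weights during max-autotune compilation.
    assert torch._inductor.config.freezing is True

# The previous value is restored on exit, even if compilation raised.
assert torch._inductor.config.freezing == before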