vllm.spec_decode.proposer_worker_base

NonLLMProposerWorkerBase

Bases: ProposerWorkerBase, ABC

Proposer worker that does not use a model with a KV cache.

Source code in vllm/spec_decode/proposer_worker_base.py
class NonLLMProposerWorkerBase(ProposerWorkerBase, ABC):
    """Proposer worker which does not use a model with kvcache"""

    def execute_model(
        self,
        execute_model_req: Optional[ExecuteModelRequest] = None
    ) -> List[SamplerOutput]:
        """get_spec_proposals is used to get the proposals"""
        return []

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """This is never called on the proposer, only the target model"""
        raise NotImplementedError

    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        pass

    def get_cache_block_size_bytes(self) -> int:
        return 0
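
As a sketch of how this base is meant to be extended, a concrete non-LLM proposer only needs to supply the proposal-side methods; the cache-related worker methods above are already stubbed out. The subclass name below is hypothetical, and the import path for ExecuteModelRequest reflects the vLLM tree this page documents but has moved between versions.

from typing import List, Optional, Set, Tuple

from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase


class HeuristicProposerWorker(NonLLMProposerWorkerBase):
    """Hypothetical proposer that derives candidates without a KV cache."""

    def sampler_output(
        self,
        execute_model_req: ExecuteModelRequest,
        sample_len: int,
        seq_ids_with_bonus_token_in_last_step: Set[int],
    ) -> Tuple[Optional[List["SamplerOutput"]], bool]:
        # No KV cache is kept, so the bonus-token bookkeeping can be
        # ignored entirely. Returning None for the outputs signals that
        # no proposals were produced (hence Optional in the signature).
        return None, False

    # The remaining abstract methods inherited from the worker base
    # (init_device, load_model, get_spec_proposals, ...) are omitted
    # for brevity; a real subclass must implement them.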

determine_num_available_blocks

determine_num_available_blocks() -> Tuple[int, int]

This is never called on the proposer worker, only on the target model worker.

Source code in vllm/spec_decode/proposer_worker_base.py
def determine_num_available_blocks(self) -> Tuple[int, int]:
    """This is never called on the proposer, only the target model"""
    raise NotImplementedError

execute_model

execute_model(
    execute_model_req: Optional[ExecuteModelRequest] = None,
) -> List[SamplerOutput]

A no-op for non-LLM proposers: proposals are obtained via get_spec_proposals instead, so this returns an empty list.

Source code in vllm/spec_decode/proposer_worker_base.py
def execute_model(
    self,
    execute_model_req: Optional[ExecuteModelRequest] = None
) -> List[SamplerOutput]:
    """get_spec_proposals is used to get the proposals"""
    return []

get_cache_block_size_bytes

get_cache_block_size_bytes() -> int

Returns 0: a non-LLM proposer allocates no KV cache blocks.

Source code in vllm/spec_decode/proposer_worker_base.py
def get_cache_block_size_bytes(self) -> int:
    return 0

initialize_cache

initialize_cache(
    num_gpu_blocks: int, num_cpu_blocks: int
) -> None

A no-op: a non-LLM proposer has no KV cache to initialize.

Source code in vllm/spec_decode/proposer_worker_base.py
def initialize_cache(self, num_gpu_blocks: int,
                     num_cpu_blocks: int) -> None:
    pass

ProposerWorkerBase

Bases: LoRANotSupportedWorkerBase, SpeculativeProposer

Interface for proposer workers

Source code in vllm/spec_decode/proposer_worker_base.py
class ProposerWorkerBase(LoRANotSupportedWorkerBase, SpeculativeProposer):
    """Interface for proposer workers"""

    @abstractmethod
    def sampler_output(
        self,
        execute_model_req: ExecuteModelRequest,
        sample_len: int,
        # A set containing all sequence IDs that were assigned bonus tokens
        # in their last forward pass. This set is used to backfill the KV cache
        # with the key-value pairs of the penultimate token in the sequences.
        # This parameter is only used by the MultiStepWorker, which relies on
        # the KV cache for token generation. It is not used by workers that
        # do not utilize the KV cache.
        seq_ids_with_bonus_token_in_last_step: Set[int]
    ) -> Tuple[Optional[List[SamplerOutput]], bool]:
        raise NotImplementedError

    def set_include_gpu_probs_tensor(self) -> None:
        """Implementation optional"""
        pass

    def set_should_modify_greedy_probs_inplace(self) -> None:
        """Implementation optional"""
        pass
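
To make the contract concrete, here is a minimal caller-side sketch, not vLLM's actual driver code (in vLLM these calls are made by the speculative-decoding machinery). Treat the interpretation of the returned boolean as a "step-major/transposed outputs" flag as an assumption.

from typing import Set

from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase


def init_proposer(proposer: ProposerWorkerBase) -> None:
    # Optional hooks; the base implementations above are no-ops, so
    # calling them unconditionally is safe.
    proposer.set_include_gpu_probs_tensor()
    proposer.set_should_modify_greedy_probs_inplace()


def propose(proposer: ProposerWorkerBase, req: ExecuteModelRequest,
            k: int, bonus_ids: Set[int]):
    outputs, transposed = proposer.sampler_output(
        req,
        sample_len=k,
        seq_ids_with_bonus_token_in_last_step=bonus_ids,
    )
    if outputs is None:
        # No proposals were produced; a caller would fall back to
        # plain, non-speculative decoding for this step.
        return None
    return outputs, transposed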

sampler_output abstractmethod

sampler_output(
    execute_model_req: ExecuteModelRequest,
    sample_len: int,
    seq_ids_with_bonus_token_in_last_step: Set[int],
) -> Tuple[Optional[List[SamplerOutput]], bool]

Abstract: produce the proposer's sampler outputs for a proposal of length sample_len; the inline comments below document the bonus-token parameter.

Source code in vllm/spec_decode/proposer_worker_base.py
@abstractmethod
def sampler_output(
    self,
    execute_model_req: ExecuteModelRequest,
    sample_len: int,
    # A set containing all sequence IDs that were assigned bonus tokens
    # in their last forward pass. This set is used to backfill the KV cache
    # with the key-value pairs of the penultimate token in the sequences.
    # This parameter is only used by the MultiStepWorker, which relies on
    # the KV cache for token generation. It is not used by workers that
    # do not utilize the KV cache.
    seq_ids_with_bonus_token_in_last_step: Set[int]
) -> Tuple[Optional[List[SamplerOutput]], bool]:
    raise NotImplementedError
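
The parameter comment above implies a backfill pass in KV-cache-backed implementations. Below is a sketch of how such a worker might select the sequences to backfill; the helper itself is hypothetical, while seq_group_metadata_list and seq_data follow the shapes of ExecuteModelRequest and SequenceGroupMetadata.

from typing import List, Set

from vllm.sequence import ExecuteModelRequest


def seq_ids_needing_kv_backfill(
        execute_model_req: ExecuteModelRequest,
        seq_ids_with_bonus_token_in_last_step: Set[int]) -> List[int]:
    needs_backfill: List[int] = []
    for meta in execute_model_req.seq_group_metadata_list:
        for seq_id in meta.seq_data:
            if seq_id in seq_ids_with_bonus_token_in_last_step:
                # These sequences received a bonus token last step, so
                # the KV entries for their penultimate token must be
                # backfilled before the next multi-step proposal run.
                needs_backfill.append(seq_id)
    return needs_backfill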

set_include_gpu_probs_tensor

set_include_gpu_probs_tensor() -> None

Optional hook; the base implementation is a no-op.

Source code in vllm/spec_decode/proposer_worker_base.py
def set_include_gpu_probs_tensor(self) -> None:
    """Implementation optional"""
    pass

set_should_modify_greedy_probs_inplace

set_should_modify_greedy_probs_inplace() -> None

Optional hook; the base implementation is a no-op.

Source code in vllm/spec_decode/proposer_worker_base.py
def set_should_modify_greedy_probs_inplace(self) -> None:
    """Implementation optional"""
    pass
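
One plausible pattern for overriding both hooks is a subclass that owns a sampler and flips the corresponding flags on it. A sketch only: the sampler attribute and its flag names mirror vLLM's Sampler but should be treated as assumptions here.

from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase


class SamplerBackedProposerWorker(ProposerWorkerBase):
    # Assumes self.sampler is created during model loading (omitted,
    # along with the other abstract worker methods).

    def set_include_gpu_probs_tensor(self) -> None:
        # Keep the sampled probability tensor on the GPU so downstream
        # scoring can consume it without a device-to-host copy.
        self.sampler.include_gpu_probs_tensor = True

    def set_should_modify_greedy_probs_inplace(self) -> None:
        self.sampler.should_modify_greedy_probs_inplace = True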