Skip to content

vllm.v1.spec_decode.ngram_proposer

NgramProposer

Source code in vllm/v1/spec_decode/ngram_proposer.py
class NgramProposer:
    """Speculative-decoding proposer that drafts tokens via n-gram lookup.

    The trailing n-gram of the context is searched for an earlier
    occurrence; the tokens that followed that occurrence become the draft.
    """

    def __init__(self, vllm_config: VllmConfig):
        spec_cfg = vllm_config.speculative_config
        # Shortest n-gram length that may be matched against the context.
        self.min_n = spec_cfg.prompt_lookup_min
        # Longest n-gram length that may be matched against the context.
        self.max_n = spec_cfg.prompt_lookup_max
        # Number of draft tokens to emit after a match; fewer are returned
        # when the context ends before that many tokens are available.
        self.k = spec_cfg.num_speculative_tokens
        # Hard cap on the total sequence length for the target model.
        self.max_model_len = vllm_config.model_config.max_model_len

        # Run one dummy proposal so Numba JIT-compiles the helpers now.
        # This usually takes less than 1 second.
        self.propose(np.zeros(1024, dtype=np.int32))

    def propose(
        self,
        context_token_ids: np.ndarray,
    ) -> Optional[np.ndarray]:
        """Draft the next tokens by n-gram matching against the context.

        Tries the longest allowed n-gram first and falls back to shorter
        ones; on a match, returns the tokens that followed it.

        Args:
            context_token_ids: 1-D numpy array of token IDs forming the
                               context sequence.

        Returns:
            The tokens that followed the matched n-gram (truncated at the
            end of the context), or ``None`` when no n-gram matches or the
            context already fills the model's maximum length.

        Example:
            With context_token_ids = [1,2,3,4,2,3], min_n = 2, max_n = 3
            and k = 4: the 3-gram [4,2,3] finds no earlier match, but the
            2-gram [2,3] matches inside [1,2,3,4], so the tokens after it
            — [4,2,3] — are returned (only three tokens remain after the
            match).
        """
        # Never draft past the model's maximum sequence length.
        budget = self.max_model_len - context_token_ids.shape[0]
        num_draft = self.k if self.k < budget else budget
        if num_draft <= 0:
            return None

        # TODO(woosuk): Optimize this.
        # Prefer the longest n-gram; fall back toward min_n on misses.
        n = self.max_n
        while n >= self.min_n:
            match = _find_subarray_kmp(context_token_ids, n, num_draft)
            if match is not None:
                return match
            n -= 1
        return None

    def load_model(self, *args, **kwargs):
        """No-op: the n-gram proposer has no model weights to load."""

k instance-attribute

k = num_speculative_tokens

max_model_len instance-attribute

max_model_len = max_model_len

max_n instance-attribute

max_n = prompt_lookup_max

min_n instance-attribute

min_n = prompt_lookup_min

__init__

__init__(vllm_config: VllmConfig)
Source code in vllm/v1/spec_decode/ngram_proposer.py
def __init__(self, vllm_config: VllmConfig):
    """Read n-gram lookup settings from the config and warm up the JIT."""
    spec = vllm_config.speculative_config
    # Smallest n-gram length that will be searched for.
    self.min_n = spec.prompt_lookup_min
    # Largest n-gram length that will be searched for.
    self.max_n = spec.prompt_lookup_max
    # How many tokens to draft after a match; fewer are returned when
    # the context ends before that many tokens are available.
    self.k = spec.num_speculative_tokens
    # Upper bound on the total sequence length of the model.
    self.max_model_len = vllm_config.model_config.max_model_len

    # One dummy proposal forces Numba to JIT-compile the helpers here.
    # This usually takes less than 1 second.
    self.propose(np.zeros(1024, dtype=np.int32))

load_model

load_model(*args, **kwargs)
Source code in vllm/v1/spec_decode/ngram_proposer.py
def load_model(self, *args, **kwargs):
    """No-op: the n-gram proposer is purely algorithmic, nothing to load."""

propose

propose(context_token_ids: ndarray) -> Optional[ndarray]

Proposes the next sequence of tokens based on n-gram pattern matching in the context. The function finds matches of the last n tokens in the previous context, and returns k tokens that followed that match.

Parameters:

Name Type Description Default
context_token_ids ndarray

Numpy array of token IDs representing the context sequence.

required

Returns:

Name Type Description
Optional[ndarray]

np.ndarray: The sequence of tokens that followed the matched n-gram in the context.

None Optional[ndarray]

If no matching n-gram pattern is found.

Example

If context_token_ids = [1,2,3,4,2,3], min_n = 2, max_n = 3, and k = 4:

- The last 3 (= max_n) tokens [4,2,3] cannot find a match.
- The last 2 tokens [2,3] will be matched against the previous 4 tokens [1,2,3,4].
- Finding a match of [2,3] would return the tokens that followed that pattern. Here we will return [4,2,3] because we only have three tokens after the match.

Source code in vllm/v1/spec_decode/ngram_proposer.py
def propose(
    self,
    context_token_ids: np.ndarray,
) -> Optional[np.ndarray]:
    """Propose draft tokens by matching the context's trailing n-gram.

    Starting from the longest allowed n-gram (``self.max_n``) down to the
    shortest (``self.min_n``), look for an earlier occurrence of the last
    ``n`` context tokens and return up to ``self.k`` tokens that followed
    that occurrence.

    Args:
        context_token_ids: 1-D numpy array of token IDs representing the
                           context sequence.

    Returns:
        The tokens that followed the matched n-gram, truncated at the end
        of the context, or ``None`` when no n-gram matches (or the
        context already fills the model's maximum length).

    Example:
        With context_token_ids = [1,2,3,4,2,3], min_n = 2, max_n = 3 and
        k = 4: the 3-gram [4,2,3] has no earlier match, but the 2-gram
        [2,3] matches inside [1,2,3,4], so the tokens after it — [4,2,3]
        — are returned (only three tokens remain after the match).
    """
    # Cap the draft length so the sequence stays within max_model_len.
    remaining = self.max_model_len - context_token_ids.shape[0]
    num_tokens = min(self.k, remaining)
    if num_tokens <= 0:
        return None

    # TODO(woosuk): Optimize this.
    # Try the longest n-gram first, shrinking toward min_n on misses.
    n = self.max_n
    while n >= self.min_n:
        found = _find_subarray_kmp(context_token_ids, n, num_tokens)
        if found is not None:
            return found
        n -= 1
    return None

_find_subarray_kmp

_find_subarray_kmp(
    context_token_ids: ndarray, n: int, k: int
) -> Optional[ndarray]
Source code in vllm/v1/spec_decode/ngram_proposer.py
@jit(nopython=True)
def _find_subarray_kmp(
    context_token_ids: np.ndarray,
    n: int,
    k: int,
) -> Optional[np.ndarray]:
    """KMP search for the trailing ``n`` tokens inside the earlier context.

    Scans the first ``len - n`` tokens for an occurrence of the trailing
    n-gram and, on a hit, returns up to ``k`` tokens following the match
    (fewer when the context ends first). Returns ``None`` on no match.
    """
    total = context_token_ids.shape[0]
    assert n > 0

    # The trailing n tokens form the needle we search for.
    needle = context_token_ids[-n:]
    # Failure links let the scan avoid re-comparing matched prefixes.
    fail = _kmp_lps_array(needle)

    pos = 0       # index into the haystack (context minus the needle)
    matched = 0   # how many needle tokens are currently matched
    # Stop before the needle itself so the match lies strictly earlier.
    while pos < total - n:
        if context_token_ids[pos] != needle[matched]:
            if matched == 0:
                # No partial match to salvage; advance in the haystack.
                pos += 1
            else:
                # Shrink to the longest prefix that is still a suffix.
                matched = fail[matched - 1]
            continue
        pos += 1
        matched += 1
        if matched == n:
            # Full match: the draft is whatever follows, up to k tokens.
            return context_token_ids[pos:pos + k]

    # The trailing n-gram never occurs earlier in the context.
    return None

_kmp_lps_array

_kmp_lps_array(pattern: ndarray) -> ndarray

Build the lps (longest proper prefix which is also suffix) array for the pattern.

Source code in vllm/v1/spec_decode/ngram_proposer.py
@jit(nopython=True)
def _kmp_lps_array(pattern: np.ndarray) -> np.ndarray:
    """Compute the KMP failure (lps) table for ``pattern``.

    ``result[i]`` is the length of the longest proper prefix of
    ``pattern[:i + 1]`` that is also a suffix of it.
    """
    m = len(pattern)
    table = np.zeros(m, dtype=np.int32)
    width = 0  # length of the current matching prefix-suffix
    idx = 1

    while idx < m:
        if pattern[idx] == pattern[width]:
            # Extend the previous prefix-suffix by one token.
            width += 1
            table[idx] = width
            idx += 1
        elif width != 0:
            # Fall back to the next-shorter candidate prefix.
            width = table[width - 1]
        else:
            # No prefix-suffix ends here.
            table[idx] = 0
            idx += 1

    return table