
vllm.v1.sample.tpu.sampler

Sampler layer implementing TPU-supported operations.

_SAMPLING_EPS module-attribute

_SAMPLING_EPS = 1e-05
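
This epsilon is the floor used when comparing per-request temperatures: any value below it is treated as zero, i.e. the request is decoded greedily (see Sampler.sample below). A minimal plain-PyTorch illustration with hypothetical values:

import torch

_SAMPLING_EPS = 1e-05

# Per-request temperatures; 0.0 requests greedy decoding.
temperature = torch.tensor([0.0, 0.7, 1.0])

# Requests with effectively-zero temperature are routed to argmax
# instead of random sampling.
is_greedy = temperature < _SAMPLING_EPS
print(is_greedy)  # tensor([ True, False, False])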

Sampler

Bases: Module

Source code in vllm/v1/sample/tpu/sampler.py
class Sampler(nn.Module):

    def __init__(self):
        super().__init__()
        self.topk_topp_sampler = TopKTopPSampler()

    def forward(
        self,
        logits: torch.Tensor,
        sampling_metadata: TPUSupportedSamplingMetadata,
    ) -> SamplerOutput:
        # Use float32 for the logits.
        logits = logits.to(torch.float32)
        # Sample the next token.
        sampled = self.sample(logits, sampling_metadata)

        # These are TPU tensors.
        sampler_output = SamplerOutput(
            # The sampled tokens are expanded to a 2D tensor with shape
            # [num_requests, 1], where each row holds the single token
            # generated for that request.
            sampled_token_ids=sampled.unsqueeze(-1),
            logprobs_tensors=None)
        return sampler_output

    def apply_temperature(
        self,
        logits: torch.Tensor,
        temp: torch.Tensor,
    ) -> torch.Tensor:
        return logits.div_(temp.unsqueeze(dim=1))

    def greedy_sample(self, logits: torch.Tensor) -> torch.Tensor:
        return logits.argmax(dim=-1).view(-1)

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: TPUSupportedSamplingMetadata,
    ) -> torch.Tensor:
        greedy_sampled = self.greedy_sample(logits)

        assert sampling_metadata.temperature is not None

        # Apply temperature.
        logits = self.apply_temperature(logits, sampling_metadata.temperature)

        # Apply min_p.
        if sampling_metadata.min_p is not None:
            logits = self.apply_min_p(logits, sampling_metadata.min_p)

        # Apply top_k and/or top_p.
        random_sampled = self.topk_topp_sampler(
            logits,
            sampling_metadata.generators,
            sampling_metadata.top_k,
            sampling_metadata.top_p,
        )

        sampled = torch.where(sampling_metadata.temperature < _SAMPLING_EPS,
                              greedy_sampled, random_sampled)
        return sampled

    def compute_logprobs(self, logits: torch.Tensor) -> torch.Tensor:
        return logits.log_softmax(dim=-1, dtype=torch.float32)

    def gather_logprobs(
        self,
        logprobs: torch.Tensor,
        num_logprobs: int,
        token_ids: torch.Tensor,
    ) -> LogprobsTensors:
        """
        Gather logprobs for top-k and the sampled/prompt token.

        Args:
          logprobs: (num tokens) x (vocab) tensor
          num_logprobs: minimum number of logprobs to
                        retain per token
          token_ids: prompt tokens (if prompt logprobs)
                     or sampled tokens (if sampled
                     logprobs); 1D token ID tensor
                     with (num tokens) elements

        Returns:
          Top-k int indices tensor, (num tokens) x (num_logprobs + 1)
          Top-k float logprobs tensor, (num tokens) x (num_logprobs + 1)
          Sampled token rank tensor, (num tokens)
        """
        # Find the top-k values.
        topk_logprobs, topk_indices = torch.topk(logprobs,
                                                 num_logprobs,
                                                 dim=-1)

        # Get the logprob of the prompt or sampled token.
        token_ids = token_ids.unsqueeze(-1)
        token_logprobs = logprobs.gather(-1, token_ids)

        # Compute the ranks of the actual token.
        token_ranks = (logprobs >= token_logprobs).sum(-1)

        # Concatenate together with the topk.
        indices = torch.cat((token_ids, topk_indices), dim=1)
        logprobs = torch.cat((token_logprobs, topk_logprobs), dim=1)

        # Use int32 to reduce the tensor size.
        indices = indices.to(torch.int32)

        return LogprobsTensors(indices, logprobs, token_ranks)

    def apply_min_p(
        self,
        logits: torch.Tensor,
        min_p: torch.Tensor,
    ) -> torch.Tensor:
        """
        Filters logits using adaptive probability thresholding.
        """
        # Convert logits to probability distribution
        probability_values = torch.nn.functional.softmax(logits, dim=-1)
        # Calculate maximum probabilities per sequence
        max_probabilities = torch.amax(probability_values,
                                       dim=-1,
                                       keepdim=True)
        # Reshape min_p for broadcasting
        adjusted_min_p = min_p.unsqueeze(1) * max_probabilities
        # Identify valid tokens using threshold comparison
        valid_token_mask = probability_values >= adjusted_min_p
        # Mask out filtered tokens in place with masked_fill_ (XLA-friendly)
        logits.masked_fill_(~valid_token_mask, -float("inf"))
        return logits

topk_topp_sampler instance-attribute

topk_topp_sampler = TopKTopPSampler()

__init__

__init__()
Source code in vllm/v1/sample/tpu/sampler.py
def __init__(self):
    super().__init__()
    self.topk_topp_sampler = TopKTopPSampler()

apply_min_p

apply_min_p(logits: Tensor, min_p: Tensor) -> Tensor

Filters logits using adaptive probability thresholding.

Source code in vllm/v1/sample/tpu/sampler.py
def apply_min_p(
    self,
    logits: torch.Tensor,
    min_p: torch.Tensor,
) -> torch.Tensor:
    """
    Filters logits using adaptive probability thresholding.
    """
    # Convert logits to probability distribution
    probability_values = torch.nn.functional.softmax(logits, dim=-1)
    # Calculate maximum probabilities per sequence
    max_probabilities = torch.amax(probability_values,
                                   dim=-1,
                                   keepdim=True)
    # Reshape min_p for broadcasting
    adjusted_min_p = min_p.unsqueeze(1) * max_probabilities
    # Identify valid tokens using threshold comparison
    valid_token_mask = probability_values >= adjusted_min_p
    # Mask out filtered tokens in place with masked_fill_ (XLA-friendly)
    logits.masked_fill_(~valid_token_mask, -float("inf"))
    return logits
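
The thresholding can be reproduced outside the class. A short sketch in plain PyTorch with a hypothetical two-request batch over a four-token vocabulary; masked_fill keeps tensor shapes static, which XLA requires:

import torch
import torch.nn.functional as F

# Hypothetical batch: 2 requests over a 4-token vocabulary.
logits = torch.tensor([[2.0, 1.0, 0.5, -1.0],
                       [3.0, 0.0, 0.0,  0.0]])
min_p = torch.tensor([0.2, 0.1])  # per-request min_p values

probs = F.softmax(logits, dim=-1)
max_probs = probs.amax(dim=-1, keepdim=True)
# The threshold scales with each row's peak probability.
keep = probs >= min_p.unsqueeze(1) * max_probs
filtered = logits.masked_fill(~keep, float("-inf"))
print(filtered)  # low-probability tokens are now -inf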

apply_temperature

apply_temperature(logits: Tensor, temp: Tensor) -> Tensor
Source code in vllm/v1/sample/tpu/sampler.py
def apply_temperature(
    self,
    logits: torch.Tensor,
    temp: torch.Tensor,
) -> torch.Tensor:
    return logits.div_(temp.unsqueeze(dim=1))
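
Because temp holds one temperature per request, unsqueeze(dim=1) reshapes it to [num_requests, 1] so the division broadcasts across the vocabulary dimension. Note that div_ mutates logits in place. A sketch with hypothetical values:

import torch

logits = torch.tensor([[2.0, 1.0],
                       [2.0, 1.0]])
temp = torch.tensor([0.5, 2.0])  # one temperature per request

# [2] -> [2, 1] so the division broadcasts over the vocab dimension.
# div_ is in place: the original logits tensor is overwritten.
scaled = logits.div_(temp.unsqueeze(dim=1))
print(scaled)  # tensor([[4.0000, 2.0000], [1.0000, 0.5000]])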

compute_logprobs

compute_logprobs(logits: Tensor) -> Tensor
Source code in vllm/v1/sample/tpu/sampler.py
def compute_logprobs(self, logits: torch.Tensor) -> torch.Tensor:
    return logits.log_softmax(dim=-1, dtype=torch.float32)
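
log_softmax is the numerically stable equivalent of taking the softmax and then the log; computing it in float32 avoids underflow with low-precision logits. A quick check with hypothetical values:

import torch

logits = torch.tensor([[2.0, 1.0, 0.1]])
# Equivalent to torch.log(torch.softmax(logits, dim=-1)), but stable.
logprobs = logits.log_softmax(dim=-1, dtype=torch.float32)
print(logprobs.exp().sum(dim=-1))  # ~1.0: rows are valid distributions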

forward

forward(
    logits: Tensor,
    sampling_metadata: TPUSupportedSamplingMetadata,
) -> SamplerOutput
Source code in vllm/v1/sample/tpu/sampler.py
def forward(
    self,
    logits: torch.Tensor,
    sampling_metadata: TPUSupportedSamplingMetadata,
) -> SamplerOutput:
    # Use float32 for the logits.
    logits = logits.to(torch.float32)
    # Sample the next token.
    sampled = self.sample(logits, sampling_metadata)

    # These are TPU tensors.
    sampler_output = SamplerOutput(
        # The sampled tokens are expanded to a 2D tensor with shape
        # [num_requests, 1], where each row holds the single token
        # generated for that request.
        sampled_token_ids=sampled.unsqueeze(-1),
        logprobs_tensors=None)
    return sampler_output
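
The only shape transformation forward performs on the sampled ids is the final unsqueeze, which turns the 1D result of sample into the [num_requests, 1] layout SamplerOutput expects. Token ids below are illustrative:

import torch

# sample() returns one token id per request as a 1D tensor; forward()
# expands it to [num_requests, 1].
sampled = torch.tensor([42, 7, 1337])
sampled_token_ids = sampled.unsqueeze(-1)
print(sampled_token_ids.shape)  # torch.Size([3, 1])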

gather_logprobs

gather_logprobs(
    logprobs: Tensor, num_logprobs: int, token_ids: Tensor
) -> LogprobsTensors

Gather logprobs for top-k and the sampled/prompt token.

Parameters:

Name          Type    Description                                       Default
logprobs      Tensor  (num tokens) x (vocab) tensor                     required
num_logprobs  int     minimum number of logprobs to retain per token    required
token_ids     Tensor  prompt tokens (if prompt logprobs) or sampled
                      tokens (if sampled logprobs); 1D token ID tensor
                      with (num tokens) elements                        required

Returns:

Type             Description
LogprobsTensors  Top-k int indices tensor, (num tokens) x (num_logprobs + 1)
                 Top-k float logprobs tensor, (num tokens) x (num_logprobs + 1)
                 Sampled token rank tensor, (num tokens)

Source code in vllm/v1/sample/tpu/sampler.py
def gather_logprobs(
    self,
    logprobs: torch.Tensor,
    num_logprobs: int,
    token_ids: torch.Tensor,
) -> LogprobsTensors:
    """
    Gather logprobs for top-k and the sampled/prompt token.

    Args:
      logprobs: (num tokens) x (vocab) tensor
      num_logprobs: minimum number of logprobs to
                    retain per token
      token_ids: prompt tokens (if prompt logprobs)
                 or sampled tokens (if sampled
                 logprobs); 1D token ID tensor
                 with (num tokens) elements

    Returns:
      Top-k int indices tensor, (num tokens) x (num_logprobs + 1)
      Top-k float logprobs tensor, (num tokens) x (num_logprobs + 1)
      Sampled token rank tensor, (num tokens)
    """
    # Find the top-k values.
    topk_logprobs, topk_indices = torch.topk(logprobs,
                                             num_logprobs,
                                             dim=-1)

    # Get the logprob of the prompt or sampled token.
    token_ids = token_ids.unsqueeze(-1)
    token_logprobs = logprobs.gather(-1, token_ids)

    # Compute the ranks of the actual token.
    token_ranks = (logprobs >= token_logprobs).sum(-1)

    # Concatenate together with the topk.
    indices = torch.cat((token_ids, topk_indices), dim=1)
    logprobs = torch.cat((token_logprobs, topk_logprobs), dim=1)

    # Use int32 to reduce the tensor size.
    indices = indices.to(torch.int32)

    return LogprobsTensors(indices, logprobs, token_ranks)
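
The returned tensors always have num_logprobs + 1 columns: column 0 holds the sampled/prompt token itself, followed by the top-k entries (which may duplicate it). A sketch of the same steps with hypothetical shapes and values:

import torch

# Hypothetical: 2 tokens over a 5-token vocab, top-2 logprobs requested.
logprobs = torch.log_softmax(torch.randn(2, 5), dim=-1)
token_ids = torch.tensor([3, 0])
num_logprobs = 2

topk_logprobs, topk_indices = torch.topk(logprobs, num_logprobs, dim=-1)
token_logprobs = logprobs.gather(-1, token_ids.unsqueeze(-1))
# Rank 1 means the chosen token had the highest logprob in its row.
token_ranks = (logprobs >= token_logprobs).sum(-1)

indices = torch.cat((token_ids.unsqueeze(-1), topk_indices), dim=1)
print(indices.shape)  # torch.Size([2, 3]) == (num tokens, num_logprobs + 1)
print(token_ranks)    # each value is in 1..5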

greedy_sample

greedy_sample(logits: Tensor) -> Tensor
Source code in vllm/v1/sample/tpu/sampler.py
def greedy_sample(self, logits: torch.Tensor) -> torch.Tensor:
    return logits.argmax(dim=-1).view(-1)

sample

sample(
    logits: Tensor,
    sampling_metadata: TPUSupportedSamplingMetadata,
) -> Tensor
Source code in vllm/v1/sample/tpu/sampler.py
def sample(
    self,
    logits: torch.Tensor,
    sampling_metadata: TPUSupportedSamplingMetadata,
) -> torch.Tensor:
    greedy_sampled = self.greedy_sample(logits)

    assert sampling_metadata.temperature is not None

    # Apply temperature.
    logits = self.apply_temperature(logits, sampling_metadata.temperature)

    # Apply min_p.
    if sampling_metadata.min_p is not None:
        logits = self.apply_min_p(logits, sampling_metadata.min_p)

    # Apply top_k and/or top_p.
    random_sampled = self.topk_topp_sampler(
        logits,
        sampling_metadata.generators,
        sampling_metadata.top_k,
        sampling_metadata.top_p,
    )

    sampled = torch.where(sampling_metadata.temperature < _SAMPLING_EPS,
                          greedy_sampled, random_sampled)
    return sampled
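
Both the greedy and the random path are always computed; torch.where then picks per request, which keeps the computation graph static for XLA. A minimal sketch of the merge with hypothetical token ids:

import torch

_SAMPLING_EPS = 1e-05

temperature = torch.tensor([0.0, 0.8])
greedy_sampled = torch.tensor([5, 11])   # argmax results (made up)
random_sampled = torch.tensor([9, 27])   # top-k/top-p results (made up)

# Request 0 has ~zero temperature, so its greedy token wins;
# request 1 keeps its randomly sampled token.
sampled = torch.where(temperature < _SAMPLING_EPS,
                      greedy_sampled, random_sampled)
print(sampled)  # tensor([ 5, 27])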