vllm.lora.punica_wrapper.punica_hpu

PunicaWrapperHPU

Bases: PunicaWrapperBase

Source code in vllm/lora/punica_wrapper/punica_hpu.py
@final
class PunicaWrapperHPU(PunicaWrapperBase):

    def __init__(self, max_num_batched_tokens: int, max_batches: int,
                 device: Union[torch.device, str], **kwargs):
        # Increase max_num_batched_tokens by 3x to handle the growth in
        # tensor size caused by padding.
        PunicaWrapperBase.__init__(self, 3 * max_num_batched_tokens,
                                   max_batches, device)

    def _update_base_metadata(
        self,
        mapping: "LoRAMapping",
        lora_index_to_id: list[Optional[int]],
        max_loras: int,
        vocab_size: int,
        extra_vocab_size: int,
        long_lora_context: Optional["LongContextLoRAContext"] = None,
    ):
        (
            base_indices,
            sampler_indices,
            sampler_indices_padded,
            embeddings_indices,
            long_lora_offsets_tensor,
            indices_len,
        ) = convert_mapping(mapping, lora_index_to_id, max_loras, vocab_size,
                            extra_vocab_size, self.device, None)
        # Updating `long_lora_offsets` element by element with `lora_offset`
        # hurts performance on HPU, because each write becomes a
        # `strided_insert` op during lazy graph accumulation. Hence, on HPU,
        # the `lora_offset` values are appended to a Python list and converted
        # to a tensor in a single step once the list is complete.
        if long_lora_context:
            index_mapping_indices: list[int] = list(
                mapping.index_mapping).copy()
            long_lora_offsets: list[int] = []
            for i in range(len(index_mapping_indices)):
                lora_offset: int = long_lora_context.offsets_by_lora_id.get(
                    index_mapping_indices[i], 0)
                long_lora_offsets.append(lora_offset)
            long_lora_offsets_tensor = torch.tensor(long_lora_offsets,
                                                    device=self.device,
                                                    dtype=torch.long)
            indices_len[-1] = long_lora_offsets_tensor.shape[-1]

        self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices)
        self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices)
        self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_(
            sampler_indices_padded)
        self._embeddings_indices[:embeddings_indices.shape[0],
                                 :embeddings_indices.shape[1]].copy_(
                                     embeddings_indices)
        if long_lora_offsets_tensor is not None:
            self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_(
                long_lora_offsets_tensor)
        else:
            self._long_lora_indices.zero_()
        self.indices_len[:] = indices_len

    def add_lora_embedding(self,
                           y: torch.Tensor,
                           x: torch.Tensor,
                           lora_b_stacked: torch.Tensor,
                           add_inputs: bool = True,
                           **kwargs) -> None:
        dispatch_bgmv_embedding(y, x, lora_b_stacked, 0)

    def add_lora_linear(self,
                        y: torch.Tensor,
                        x: torch.Tensor,
                        lora_a_stacked: tuple[torch.Tensor, ...],
                        lora_b_stacked: tuple[torch.Tensor, ...],
                        lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
                        scale: float,
                        output_slices: tuple[int, ...],
                        *,
                        buffer: Optional[tuple[torch.Tensor, ...]] = None,
                        **kwargs) -> None:
        y_org = y
        x = x.view(-1, x.shape[-1])
        y = y.view(-1, y.shape[-1])
        offset_left = 0

        for slice_idx in range(len(output_slices)):
            dispatch_bgmv_linear(
                y[:, offset_left:offset_left + output_slices[slice_idx]], x,
                lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, scale)
            offset_left += output_slices[slice_idx]
        y = y.view_as(y_org)

    def add_lora_logits(self,
                        y: torch.Tensor,
                        x: torch.Tensor,
                        lora_a_stacked: torch.Tensor,
                        lora_b_stacked: torch.Tensor,
                        scale,
                        *,
                        buffer: Optional[torch.Tensor] = None,
                        **kwargs) -> None:
        y_org = y
        y = y.view(-1, y.shape[-1])
        x = x.view(-1, x.shape[-1])
        dispatch_bgmv_linear(y, x, lora_a_stacked, lora_b_stacked, 0, scale)
        y = y.view_as(y_org)

    def add_shrink(
        self,
        y: Union[tuple[torch.Tensor, ...], torch.Tensor],
        x: torch.Tensor,
        lora_a_stacked: tuple[torch.Tensor, ...],
        scale: float,
        **kwargs,
    ) -> None:
        raise NotImplementedError

    def add_expand(
        self,
        y: torch.Tensor,
        x: Union[tuple[torch.Tensor, ...], torch.Tensor],
        lora_b_stacked: tuple[torch.Tensor, ...],
        lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
        output_slices: tuple[int, ...],
        offset_start: int = 0,
        add_inputs=True,
        **kwargs,
    ) -> None:
        raise NotImplementedError

__init__

__init__(
    max_num_batched_tokens: int,
    max_batches: int,
    device: Union[device, str],
    **kwargs,
)
Source code in vllm/lora/punica_wrapper/punica_hpu.py
def __init__(self, max_num_batched_tokens: int, max_batches: int,
             device: Union[torch.device, str], **kwargs):
    # Increase max_num_batched_tokens by 3x to handle the growth in
    # tensor size caused by padding.
    PunicaWrapperBase.__init__(self, 3 * max_num_batched_tokens,
                               max_batches, device)
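
A minimal sketch of the sizing above, using a hypothetical engine setting. The wrapper hands PunicaWrapperBase three times the configured token budget so that the index tensors still fit after padding:

# Hypothetical numbers; only illustrates the 3x head-room applied above.
max_num_batched_tokens = 8192          # engine-level token budget (hypothetical)
padded_capacity = 3 * max_num_batched_tokens
print(padded_capacity)                 # 24576 slots reserved for the index tensors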

_update_base_metadata

_update_base_metadata(
    mapping: LoRAMapping,
    lora_index_to_id: list[Optional[int]],
    max_loras: int,
    vocab_size: int,
    extra_vocab_size: int,
    long_lora_context: Optional[
        LongContextLoRAContext
    ] = None,
)
Source code in vllm/lora/punica_wrapper/punica_hpu.py
def _update_base_metadata(
    self,
    mapping: "LoRAMapping",
    lora_index_to_id: list[Optional[int]],
    max_loras: int,
    vocab_size: int,
    extra_vocab_size: int,
    long_lora_context: Optional["LongContextLoRAContext"] = None,
):
    (
        base_indices,
        sampler_indices,
        sampler_indices_padded,
        embeddings_indices,
        long_lora_offsets_tensor,
        indices_len,
    ) = convert_mapping(mapping, lora_index_to_id, max_loras, vocab_size,
                        extra_vocab_size, self.device, None)
    # Updating `long_lora_offsets` element by element with `lora_offset`
    # hurts performance on HPU, because each write becomes a
    # `strided_insert` op during lazy graph accumulation. Hence, on HPU,
    # the `lora_offset` values are appended to a Python list and converted
    # to a tensor in a single step once the list is complete.
    if long_lora_context:
        index_mapping_indices: list[int] = list(
            mapping.index_mapping).copy()
        long_lora_offsets: list[int] = []
        for i in range(len(index_mapping_indices)):
            lora_offset: int = long_lora_context.offsets_by_lora_id.get(
                index_mapping_indices[i], 0)
            long_lora_offsets.append(lora_offset)
        long_lora_offsets_tensor = torch.tensor(long_lora_offsets,
                                                device=self.device,
                                                dtype=torch.long)
        indices_len[-1] = long_lora_offsets_tensor.shape[-1]

    self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices)
    self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices)
    self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_(
        sampler_indices_padded)
    self._embeddings_indices[:embeddings_indices.shape[0],
                             :embeddings_indices.shape[1]].copy_(
                                 embeddings_indices)
    if long_lora_offsets_tensor is not None:
        self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_(
            long_lora_offsets_tensor)
    else:
        self._long_lora_indices.zero_()
    self.indices_len[:] = indices_len
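
A small CPU-only illustration of the pattern described in the comment above, with hypothetical stand-ins for mapping.index_mapping and long_lora_context.offsets_by_lora_id. Building a Python list and converting it to a tensor once avoids the per-element tensor writes that would each lower to a strided_insert op under HPU lazy-graph accumulation:

import torch

# Hypothetical per-token LoRA ids and their long-context offsets.
index_mapping = [0, 0, 1, 2, 1]
offsets_by_lora_id = {1: 4096, 2: 8192}

# Pattern used above: accumulate plain ints, build the tensor in one call.
long_lora_offsets = [offsets_by_lora_id.get(i, 0) for i in index_mapping]
fast = torch.tensor(long_lora_offsets, dtype=torch.long)

# Avoided pattern: element-wise writes into a pre-allocated tensor, which on
# HPU would emit one strided_insert per assignment.
slow = torch.zeros(len(index_mapping), dtype=torch.long)
for i, lora_id in enumerate(index_mapping):
    slow[i] = offsets_by_lora_id.get(lora_id, 0)

assert torch.equal(fast, slow)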

add_expand

add_expand(
    y: Tensor,
    x: Union[tuple[Tensor, ...], Tensor],
    lora_b_stacked: tuple[Tensor, ...],
    lora_bias_stacked: Optional[tuple[Tensor, ...]],
    output_slices: tuple[int, ...],
    offset_start: int = 0,
    add_inputs=True,
    **kwargs,
) -> None
Source code in vllm/lora/punica_wrapper/punica_hpu.py
def add_expand(
    self,
    y: torch.Tensor,
    x: Union[tuple[torch.Tensor, ...], torch.Tensor],
    lora_b_stacked: tuple[torch.Tensor, ...],
    lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
    output_slices: tuple[int, ...],
    offset_start: int = 0,
    add_inputs=True,
    **kwargs,
) -> None:
    raise NotImplementedError

add_lora_embedding

add_lora_embedding(
    y: Tensor,
    x: Tensor,
    lora_b_stacked: Tensor,
    add_inputs: bool = True,
    **kwargs,
) -> None
Source code in vllm/lora/punica_wrapper/punica_hpu.py
def add_lora_embedding(self,
                       y: torch.Tensor,
                       x: torch.Tensor,
                       lora_b_stacked: torch.Tensor,
                       add_inputs: bool = True,
                       **kwargs) -> None:
    dispatch_bgmv_embedding(y, x, lora_b_stacked, 0)

add_lora_linear

add_lora_linear(
    y: Tensor,
    x: Tensor,
    lora_a_stacked: tuple[Tensor, ...],
    lora_b_stacked: tuple[Tensor, ...],
    lora_bias_stacked: Optional[tuple[Tensor, ...]],
    scale: float,
    output_slices: tuple[int, ...],
    *,
    buffer: Optional[tuple[Tensor, ...]] = None,
    **kwargs,
) -> None
Source code in vllm/lora/punica_wrapper/punica_hpu.py
def add_lora_linear(self,
                    y: torch.Tensor,
                    x: torch.Tensor,
                    lora_a_stacked: tuple[torch.Tensor, ...],
                    lora_b_stacked: tuple[torch.Tensor, ...],
                    lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
                    scale: float,
                    output_slices: tuple[int, ...],
                    *,
                    buffer: Optional[tuple[torch.Tensor, ...]] = None,
                    **kwargs) -> None:
    y_org = y
    x = x.view(-1, x.shape[-1])
    y = y.view(-1, y.shape[-1])
    offset_left = 0

    for slice_idx in range(len(output_slices)):
        dispatch_bgmv_linear(
            y[:, offset_left:offset_left + output_slices[slice_idx]], x,
            lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, scale)
        offset_left += output_slices[slice_idx]
    y = y.view_as(y_org)
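
The slicing arithmetic in the loop above can be checked without the HPU kernels. With hypothetical slice widths, each iteration hands dispatch_bgmv_linear a disjoint column block of y whose width matches output_slices[slice_idx]:

import torch

# Hypothetical fused output (e.g. merged projections) split as (8, 8, 4).
output_slices = (8, 8, 4)
y = torch.zeros(2, sum(output_slices))

offset_left = 0
for slice_idx in range(len(output_slices)):
    # The column block that add_lora_linear would pass, together with
    # lora_a_stacked[slice_idx] and lora_b_stacked[slice_idx], to the kernel.
    block = y[:, offset_left:offset_left + output_slices[slice_idx]]
    assert block.shape == (2, output_slices[slice_idx])
    offset_left += output_slices[slice_idx]

assert offset_left == y.shape[-1]      # the blocks tile y exactly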

add_lora_logits

add_lora_logits(
    y: Tensor,
    x: Tensor,
    lora_a_stacked: Tensor,
    lora_b_stacked: Tensor,
    scale,
    *,
    buffer: Optional[Tensor] = None,
    **kwargs,
) -> None
Source code in vllm/lora/punica_wrapper/punica_hpu.py
def add_lora_logits(self,
                    y: torch.Tensor,
                    x: torch.Tensor,
                    lora_a_stacked: torch.Tensor,
                    lora_b_stacked: torch.Tensor,
                    scale,
                    *,
                    buffer: Optional[torch.Tensor] = None,
                    **kwargs) -> None:
    y_org = y
    y = y.view(-1, y.shape[-1])
    x = x.view(-1, x.shape[-1])
    dispatch_bgmv_linear(y, x, lora_a_stacked, lora_b_stacked, 0, scale)
    y = y.view_as(y_org)

add_shrink

add_shrink(
    y: Union[tuple[Tensor, ...], Tensor],
    x: Tensor,
    lora_a_stacked: tuple[Tensor, ...],
    scale: float,
    **kwargs,
) -> None
Source code in vllm/lora/punica_wrapper/punica_hpu.py
def add_shrink(
    self,
    y: Union[tuple[torch.Tensor, ...], torch.Tensor],
    x: torch.Tensor,
    lora_a_stacked: tuple[torch.Tensor, ...],
    scale: float,
    **kwargs,
) -> None:
    raise NotImplementedError