vllm.model_executor.models.qwen2_audio

Inference-only Qwen2-Audio model compatible with HuggingFace weights.
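A minimal offline-inference sketch with this model, assuming the public vLLM LLM API, the Qwen/Qwen2-Audio-7B-Instruct checkpoint, and librosa for loading audio; the prompt uses the placeholder format returned by get_placeholder_str below.

import librosa
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", max_model_len=4096)
audio, sr = librosa.load("sample.wav", sr=16000)

prompt = ("<|im_start|>user\n"
          "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
          "What is being said in this audio?<|im_end|>\n"
          "<|im_start|>assistant\n")

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"audio": [(audio, sr)]}},
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)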

Qwen2AudioDummyInputsBuilder

Bases: BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]

Source code in vllm/model_executor/models/qwen2_audio.py
class Qwen2AudioDummyInputsBuilder(
        BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_audios = mm_counts.get("audio", 0)

        hf_processor = self.info.get_hf_processor()
        audio_token = hf_processor.audio_token

        return audio_token * num_audios

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        feature_extractor = self.info.get_feature_extractor()

        sampling_rate = feature_extractor.sampling_rate
        audio_len = feature_extractor.chunk_length * sampling_rate
        num_audios = mm_counts.get("audio", 0)

        return {
            "audio":
            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
        }

get_dummy_mm_data

get_dummy_mm_data(
    seq_len: int, mm_counts: Mapping[str, int]
) -> MultiModalDataDict
Source code in vllm/model_executor/models/qwen2_audio.py
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
    feature_extractor = self.info.get_feature_extractor()

    sampling_rate = feature_extractor.sampling_rate
    audio_len = feature_extractor.chunk_length * sampling_rate
    num_audios = mm_counts.get("audio", 0)

    return {
        "audio":
        self._get_dummy_audios(length=audio_len, num_audios=num_audios)
    }
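For scale, a rough sketch of what this dummy data amounts to, assuming the default Whisper feature extractor settings (chunk_length of 30 s at a 16 kHz sampling rate); the exact container returned by _get_dummy_audios may differ.

import numpy as np

sampling_rate, chunk_length, num_audios = 16000, 30, 2
audio_len = chunk_length * sampling_rate              # 480000 samples per audio
dummy_audios = [np.zeros(audio_len, dtype=np.float32) for _ in range(num_audios)]
print(len(dummy_audios), dummy_audios[0].shape)       # 2 (480000,)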

get_dummy_text

get_dummy_text(mm_counts: Mapping[str, int]) -> str
Source code in vllm/model_executor/models/qwen2_audio.py
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    num_audios = mm_counts.get("audio", 0)

    hf_processor = self.info.get_hf_processor()
    audio_token = hf_processor.audio_token

    return audio_token * num_audios

Qwen2AudioForConditionalGeneration

Bases: Module, SupportsMultiModal, SupportsPP

Source code in vllm/model_executor/models/qwen2_audio.py
@MULTIMODAL_REGISTRY.register_processor(
    Qwen2AudioMultiModalProcessor,
    info=Qwen2AudioProcessingInfo,
    dummy_inputs=Qwen2AudioDummyInputsBuilder)
class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
        if modality.startswith("audio"):
            return f"Audio {i}: <|audio_bos|><|AUDIO|><|audio_eos|>"

        raise ValueError("Only audio modality is supported")

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        multimodal_config = vllm_config.model_config.multimodal_config
        self.config = config
        self.multimodal_config = multimodal_config

        self.audio_tower = Qwen2AudioEncoder(config.audio_config)
        self.multi_modal_projector = Qwen2AudioMultiModalProjector(
            config.audio_config.d_model, config.text_config.hidden_size)

        self.quant_config = quant_config

        self.language_model = init_vllm_registered_model(
            vllm_config=vllm_config,
            hf_config=config.text_config,
            prefix=maybe_prefix(prefix, "language_model"),
            architectures=["Qwen2ForCausalLM"],
        )

        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors)

    def _validate_and_reshape_mm_tensor(self, mm_input: object,
                                        name: str) -> torch.Tensor:
        if not isinstance(mm_input, (torch.Tensor, list)):
            raise ValueError(f"Incorrect type of {name}. "
                             f"Got type: {type(mm_input)}")
        if isinstance(mm_input, torch.Tensor):
            return torch.concat(list(mm_input))
        else:
            return torch.concat(mm_input)

    def _parse_and_validate_audio_input(
            self, **kwargs: object) -> Optional[Qwen2AudioInputs]:
        input_features = kwargs.pop('input_features', None)
        feature_attention_mask = kwargs.pop('feature_attention_mask', None)
        if input_features is None:
            return None
        input_features = self._validate_and_reshape_mm_tensor(
            input_features, 'input_features')
        feature_attention_mask = self._validate_and_reshape_mm_tensor(
            feature_attention_mask, 'feature_attention_mask')
        if not isinstance(input_features, (torch.Tensor, list)):
            raise ValueError("Incorrect type of audio input features. "
                             f"Got type: {type(input_features)}")
        return Qwen2AudioInputs(input_features=input_features,
                                feature_attention_mask=feature_attention_mask)

    def _process_audio_input(self,
                             audio_input: Qwen2AudioInputs) -> torch.Tensor:

        input_features = audio_input["input_features"]
        feature_attention_mask = audio_input["feature_attention_mask"]

        audio_feat_lengths, audio_output_lengths = (
            self.audio_tower._get_feat_extract_output_lengths(
                feature_attention_mask.sum(-1)))

        batch_size, _, max_mel_seq_len = input_features.shape
        max_seq_len = (max_mel_seq_len - 2) // 2 + 1
        # Create a sequence tensor of shape (batch_size, max_seq_len)
        seq_range = (torch.arange(
            0,
            max_seq_len,
            dtype=audio_feat_lengths.dtype,
            device=audio_feat_lengths.device).unsqueeze(0).expand(
                batch_size, max_seq_len))
        lengths_expand = audio_feat_lengths.unsqueeze(-1).expand(
            batch_size, max_seq_len)
        # Create mask
        padding_mask = seq_range >= lengths_expand

        audio_attention_mask_ = padding_mask.view(
            batch_size, 1, 1, max_seq_len).expand(batch_size, 1, max_seq_len,
                                                  max_seq_len)
        audio_attention_mask = audio_attention_mask_.to(
            dtype=self.audio_tower.conv1.weight.dtype,
            device=self.audio_tower.conv1.weight.device)
        audio_attention_mask[audio_attention_mask_] = float("-inf")

        audio_outputs = self.audio_tower(input_features,
                                         attention_mask=audio_attention_mask)
        selected_audio_feature = audio_outputs.last_hidden_state
        audio_features = self.multi_modal_projector(selected_audio_feature)
        num_audios, max_audio_tokens, embed_dim = audio_features.shape
        audio_output_lengths = audio_output_lengths.unsqueeze(1)
        audio_features_mask = torch.arange(max_audio_tokens).expand(
            num_audios, max_audio_tokens).to(
                audio_output_lengths.device) < audio_output_lengths
        masked_audio_features = audio_features[audio_features_mask].view(
            -1, embed_dim)

        # Split to tuple of embeddings for individual audio input.
        return torch.split(masked_audio_features,
                           audio_output_lengths.flatten().tolist())

    def get_language_model(self) -> torch.nn.Module:
        return self.language_model

    def get_multimodal_embeddings(self,
                                  **kwargs: object) -> MultiModalEmbeddings:
        audio_input = self._parse_and_validate_audio_input(**kwargs)
        if audio_input is None:
            return []
        masked_audio_features = self._process_audio_input(audio_input)
        return masked_audio_features

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
    ) -> torch.Tensor:
        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
        if multimodal_embeddings is not None \
            and len(multimodal_embeddings) != 0:
            inputs_embeds = merge_multimodal_embeddings(
                input_ids, inputs_embeds, multimodal_embeddings,
                self.config.audio_token_index)
        return inputs_embeds

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
    ) -> Union[torch.Tensor, IntermediateTensors]:

        if intermediate_tensors is not None:
            inputs_embeds = None

        # NOTE: In v1, inputs_embeds is always generated at model runner, this
        # condition is for v0 compatibility.
        elif inputs_embeds is None:
            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
            inputs_embeds = self.get_input_embeddings(input_ids,
                                                      multimodal_embeddings)
            input_ids = None

        hidden_states = self.language_model.model(input_ids,
                                                  positions,
                                                  intermediate_tensors,
                                                  inputs_embeds=inputs_embeds)
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(self)
        return loader.load_weights(weights)

audio_tower instance-attribute

audio_tower = Qwen2AudioEncoder(audio_config)

config instance-attribute

config = config

language_model instance-attribute

language_model = init_vllm_registered_model(
    vllm_config=vllm_config,
    hf_config=text_config,
    prefix=maybe_prefix(prefix, "language_model"),
    architectures=["Qwen2ForCausalLM"],
)

make_empty_intermediate_tensors instance-attribute

make_empty_intermediate_tensors = (
    make_empty_intermediate_tensors
)

multi_modal_projector instance-attribute

multi_modal_projector = Qwen2AudioMultiModalProjector(
    d_model, hidden_size
)

multimodal_config instance-attribute

multimodal_config = multimodal_config

quant_config instance-attribute

quant_config = quant_config

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/qwen2_audio.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()
    config = vllm_config.model_config.hf_config
    quant_config = vllm_config.quant_config
    multimodal_config = vllm_config.model_config.multimodal_config
    self.config = config
    self.multimodal_config = multimodal_config

    self.audio_tower = Qwen2AudioEncoder(config.audio_config)
    self.multi_modal_projector = Qwen2AudioMultiModalProjector(
        config.audio_config.d_model, config.text_config.hidden_size)

    self.quant_config = quant_config

    self.language_model = init_vllm_registered_model(
        vllm_config=vllm_config,
        hf_config=config.text_config,
        prefix=maybe_prefix(prefix, "language_model"),
        architectures=["Qwen2ForCausalLM"],
    )

    self.make_empty_intermediate_tensors = (
        self.language_model.make_empty_intermediate_tensors)

_parse_and_validate_audio_input

_parse_and_validate_audio_input(
    **kwargs: object,
) -> Optional[Qwen2AudioInputs]
Source code in vllm/model_executor/models/qwen2_audio.py
def _parse_and_validate_audio_input(
        self, **kwargs: object) -> Optional[Qwen2AudioInputs]:
    input_features = kwargs.pop('input_features', None)
    feature_attention_mask = kwargs.pop('feature_attention_mask', None)
    if input_features is None:
        return None
    input_features = self._validate_and_reshape_mm_tensor(
        input_features, 'input_features')
    feature_attention_mask = self._validate_and_reshape_mm_tensor(
        feature_attention_mask, 'feature_attention_mask')
    if not isinstance(input_features, (torch.Tensor, list)):
        raise ValueError("Incorrect type of audio input features. "
                         f"Got type: {type(input_features)}")
    return Qwen2AudioInputs(input_features=input_features,
                            feature_attention_mask=feature_attention_mask)

_process_audio_input

_process_audio_input(
    audio_input: Qwen2AudioInputs,
) -> Tensor
Source code in vllm/model_executor/models/qwen2_audio.py
def _process_audio_input(self,
                         audio_input: Qwen2AudioInputs) -> torch.Tensor:

    input_features = audio_input["input_features"]
    feature_attention_mask = audio_input["feature_attention_mask"]

    audio_feat_lengths, audio_output_lengths = (
        self.audio_tower._get_feat_extract_output_lengths(
            feature_attention_mask.sum(-1)))

    batch_size, _, max_mel_seq_len = input_features.shape
    max_seq_len = (max_mel_seq_len - 2) // 2 + 1
    # Create a sequence tensor of shape (batch_size, max_seq_len)
    seq_range = (torch.arange(
        0,
        max_seq_len,
        dtype=audio_feat_lengths.dtype,
        device=audio_feat_lengths.device).unsqueeze(0).expand(
            batch_size, max_seq_len))
    lengths_expand = audio_feat_lengths.unsqueeze(-1).expand(
        batch_size, max_seq_len)
    # Create mask
    padding_mask = seq_range >= lengths_expand

    audio_attention_mask_ = padding_mask.view(
        batch_size, 1, 1, max_seq_len).expand(batch_size, 1, max_seq_len,
                                              max_seq_len)
    audio_attention_mask = audio_attention_mask_.to(
        dtype=self.audio_tower.conv1.weight.dtype,
        device=self.audio_tower.conv1.weight.device)
    audio_attention_mask[audio_attention_mask_] = float("-inf")

    audio_outputs = self.audio_tower(input_features,
                                     attention_mask=audio_attention_mask)
    selected_audio_feature = audio_outputs.last_hidden_state
    audio_features = self.multi_modal_projector(selected_audio_feature)
    num_audios, max_audio_tokens, embed_dim = audio_features.shape
    audio_output_lengths = audio_output_lengths.unsqueeze(1)
    audio_features_mask = torch.arange(max_audio_tokens).expand(
        num_audios, max_audio_tokens).to(
            audio_output_lengths.device) < audio_output_lengths
    masked_audio_features = audio_features[audio_features_mask].view(
        -1, embed_dim)

    # Split to tuple of embeddings for individual audio input.
    return torch.split(masked_audio_features,
                       audio_output_lengths.flatten().tolist())
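A standalone toy sketch (not part of the vLLM source) of the padding-mask step above: positions beyond each audio's feature length are marked True and later filled with -inf in the broadcast 4-D attention mask passed to the audio tower.

import torch

audio_feat_lengths = torch.tensor([4, 2])   # hypothetical post-conv lengths
batch_size, max_seq_len = 2, 4

seq_range = torch.arange(max_seq_len).unsqueeze(0).expand(batch_size, max_seq_len)
padding_mask = seq_range >= audio_feat_lengths.unsqueeze(-1)
print(padding_mask)
# tensor([[False, False, False, False],
#         [False, False,  True,  True]])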

_validate_and_reshape_mm_tensor

_validate_and_reshape_mm_tensor(
    mm_input: object, name: str
) -> Tensor
Source code in vllm/model_executor/models/qwen2_audio.py
def _validate_and_reshape_mm_tensor(self, mm_input: object,
                                    name: str) -> torch.Tensor:
    if not isinstance(mm_input, (torch.Tensor, list)):
        raise ValueError(f"Incorrect type of {name}. "
                         f"Got type: {type(mm_input)}")
    if isinstance(mm_input, torch.Tensor):
        return torch.concat(list(mm_input))
    else:
        return torch.concat(mm_input)

compute_logits

compute_logits(
    hidden_states: Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[Tensor]
Source code in vllm/model_executor/models/qwen2_audio.py
def compute_logits(
    self,
    hidden_states: torch.Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
    return self.language_model.compute_logits(hidden_states,
                                              sampling_metadata)

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    inputs_embeds: Optional[Tensor] = None,
    **kwargs: object,
) -> Union[Tensor, IntermediateTensors]
Source code in vllm/model_executor/models/qwen2_audio.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    **kwargs: object,
) -> Union[torch.Tensor, IntermediateTensors]:

    if intermediate_tensors is not None:
        inputs_embeds = None

    # NOTE: In v1, inputs_embeds is always generated at model runner, this
    # condition is for v0 compatibility.
    elif inputs_embeds is None:
        multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
        inputs_embeds = self.get_input_embeddings(input_ids,
                                                  multimodal_embeddings)
        input_ids = None

    hidden_states = self.language_model.model(input_ids,
                                              positions,
                                              intermediate_tensors,
                                              inputs_embeds=inputs_embeds)
    return hidden_states

get_input_embeddings

get_input_embeddings(
    input_ids: Tensor,
    multimodal_embeddings: Optional[
        MultiModalEmbeddings
    ] = None,
) -> Tensor
Source code in vllm/model_executor/models/qwen2_audio.py
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
    inputs_embeds = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is not None \
        and len(multimodal_embeddings) != 0:
        inputs_embeds = merge_multimodal_embeddings(
            input_ids, inputs_embeds, multimodal_embeddings,
            self.config.audio_token_index)
    return inputs_embeds

get_language_model

get_language_model() -> Module
Source code in vllm/model_executor/models/qwen2_audio.py
def get_language_model(self) -> torch.nn.Module:
    return self.language_model

get_multimodal_embeddings

get_multimodal_embeddings(
    **kwargs: object,
) -> MultiModalEmbeddings
Source code in vllm/model_executor/models/qwen2_audio.py
def get_multimodal_embeddings(self,
                              **kwargs: object) -> MultiModalEmbeddings:
    audio_input = self._parse_and_validate_audio_input(**kwargs)
    if audio_input is None:
        return []
    masked_audio_features = self._process_audio_input(audio_input)
    return masked_audio_features

get_placeholder_str classmethod

get_placeholder_str(modality: str, i: int) -> Optional[str]
Source code in vllm/model_executor/models/qwen2_audio.py
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    if modality.startswith("audio"):
        return f"Audio {i}: <|audio_bos|><|AUDIO|><|audio_eos|>"

    raise ValueError("Only audio modality is supported")

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]
Source code in vllm/model_executor/models/qwen2_audio.py
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    loader = AutoWeightsLoader(self)
    return loader.load_weights(weights)

Qwen2AudioInputs

Bases: TypedDict

Source code in vllm/model_executor/models/qwen2_audio.py
class Qwen2AudioInputs(TypedDict):
    input_features: torch.Tensor
    """Shape: `(num_audios, num_mel_bins, 3000)`"""

    feature_attention_mask: torch.Tensor
    """Shape: `(num_audios, 3000)`"""

feature_attention_mask instance-attribute

feature_attention_mask: Tensor

Shape: (num_audios, 3000)

input_features instance-attribute

input_features: Tensor

Shape: (num_audios, num_mel_bins, 3000)
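An illustrative construction of these inputs, assuming two audios and 128 mel bins (the actual number of mel bins comes from the model's audio config):

import torch
from vllm.model_executor.models.qwen2_audio import Qwen2AudioInputs

num_audios, num_mel_bins, num_frames = 2, 128, 3000
dummy_inputs = Qwen2AudioInputs(
    input_features=torch.zeros(num_audios, num_mel_bins, num_frames),
    feature_attention_mask=torch.ones(num_audios, num_frames, dtype=torch.long),
)
print(dummy_inputs["input_features"].shape)  # torch.Size([2, 128, 3000])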

Qwen2AudioMultiModalProcessor

Bases: BaseMultiModalProcessor[Qwen2AudioProcessingInfo]

Source code in vllm/model_executor/models/qwen2_audio.py
class Qwen2AudioMultiModalProcessor(
        BaseMultiModalProcessor[Qwen2AudioProcessingInfo]):

    def _get_data_parser(self) -> MultiModalDataParser:
        feature_extractor = self.info.get_feature_extractor()
        return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)

    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, Any],
        tok_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        # NOTE - we rename audios -> audio in mm data because transformers has
        # deprecated audios for the qwen2audio processor and will remove
        # support for it in transformers 4.54.
        audios = mm_data.pop("audios", [])
        if audios:
            mm_data["audio"] = audios

        # Text-only input not supported in composite processor
        if not mm_data.get("audio", []):
            prompt_ids = self.info.get_tokenizer().encode(prompt)
            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")

        feature_extractor = self.info.get_feature_extractor(**mm_kwargs)
        mm_kwargs = dict(
            **mm_kwargs,
            sampling_rate=feature_extractor.sampling_rate,
        )

        return super()._call_hf_processor(
            prompt=prompt,
            mm_data=mm_data,
            mm_kwargs=mm_kwargs,
            tok_kwargs=tok_kwargs,
        )

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        return dict(
            input_features=MultiModalFieldConfig.batched("audio"),
            feature_attention_mask=MultiModalFieldConfig.batched("audio"),
        )

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        tokenizer = self.info.get_tokenizer()
        vocab = tokenizer.get_vocab()

        # Use getattr with default to be compatible with transformers<4.48
        audio_token = getattr(processor, "audio_token", "<|AUDIO|>")
        audio_bos_token = getattr(processor, "audio_bos_token",
                                  "<|audio_bos|>")
        audio_eos_token = getattr(processor, "audio_eos_token",
                                  "<|audio_eos|>")

        audio_token_id = vocab[audio_token]
        audio_bos_id = vocab[audio_bos_token]
        audio_eos_id = vocab[audio_eos_token]

        feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
        if feature_attention_mask is None:
            audio_output_lengths = []
        else:
            assert isinstance(feature_attention_mask, torch.Tensor)
            _, audio_output_lens = _get_feat_extract_output_lengths(
                feature_attention_mask.sum(-1))

            audio_output_lengths = audio_output_lens.tolist()

        def get_replacement_qwen2_audio(item_idx: int):
            num_features = audio_output_lengths[item_idx]
            if num_features == 0:
                audios = mm_items.get_items("audio", AudioProcessorItems)
                audio_len = audios.get_audio_length(item_idx)

                raise ValueError(f"The audio (len={audio_len}) is too short "
                                 "to be represented inside the model")

            audio_tokens = [audio_token_id] * num_features

            return PromptUpdateDetails.select_token_id(
                [audio_bos_id] + audio_tokens + [audio_eos_id],
                embed_token_id=audio_token_id,
            )

        return [
            PromptReplacement(
                modality="audio",
                target=audio_token,
                replacement=get_replacement_qwen2_audio,
            )
        ]

_call_hf_processor

_call_hf_processor(
    prompt: str,
    mm_data: Mapping[str, object],
    mm_kwargs: Mapping[str, Any],
    tok_kwargs: Mapping[str, object],
) -> BatchFeature
Source code in vllm/model_executor/models/qwen2_audio.py
def _call_hf_processor(
    self,
    prompt: str,
    mm_data: Mapping[str, object],
    mm_kwargs: Mapping[str, Any],
    tok_kwargs: Mapping[str, object],
) -> BatchFeature:
    # NOTE - we rename audios -> audio in mm data because transformers has
    # deprecated audios for the qwen2audio processor and will remove
    # support for it in transformers 4.54.
    audios = mm_data.pop("audios", [])
    if audios:
        mm_data["audio"] = audios

    # Text-only input not supported in composite processor
    if not mm_data.get("audio", []):
        prompt_ids = self.info.get_tokenizer().encode(prompt)
        prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
        return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")

    feature_extractor = self.info.get_feature_extractor(**mm_kwargs)
    mm_kwargs = dict(
        **mm_kwargs,
        sampling_rate=feature_extractor.sampling_rate,
    )

    return super()._call_hf_processor(
        prompt=prompt,
        mm_data=mm_data,
        mm_kwargs=mm_kwargs,
        tok_kwargs=tok_kwargs,
    )

_get_data_parser

_get_data_parser() -> MultiModalDataParser
Source code in vllm/model_executor/models/qwen2_audio.py
def _get_data_parser(self) -> MultiModalDataParser:
    feature_extractor = self.info.get_feature_extractor()
    return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)

_get_mm_fields_config

_get_mm_fields_config(
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]
Source code in vllm/model_executor/models/qwen2_audio.py
def _get_mm_fields_config(
    self,
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
    return dict(
        input_features=MultiModalFieldConfig.batched("audio"),
        feature_attention_mask=MultiModalFieldConfig.batched("audio"),
    )

_get_prompt_updates

_get_prompt_updates(
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]
Source code in vllm/model_executor/models/qwen2_audio.py
def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
    processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
    tokenizer = self.info.get_tokenizer()
    vocab = tokenizer.get_vocab()

    # Use getattr with default to be compatible with transformers<4.48
    audio_token = getattr(processor, "audio_token", "<|AUDIO|>")
    audio_bos_token = getattr(processor, "audio_bos_token",
                              "<|audio_bos|>")
    audio_eos_token = getattr(processor, "audio_eos_token",
                              "<|audio_eos|>")

    audio_token_id = vocab[audio_token]
    audio_bos_id = vocab[audio_bos_token]
    audio_eos_id = vocab[audio_eos_token]

    feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
    if feature_attention_mask is None:
        audio_output_lengths = []
    else:
        assert isinstance(feature_attention_mask, torch.Tensor)
        _, audio_output_lens = _get_feat_extract_output_lengths(
            feature_attention_mask.sum(-1))

        audio_output_lengths = audio_output_lens.tolist()

    def get_replacement_qwen2_audio(item_idx: int):
        num_features = audio_output_lengths[item_idx]
        if num_features == 0:
            audios = mm_items.get_items("audio", AudioProcessorItems)
            audio_len = audios.get_audio_length(item_idx)

            raise ValueError(f"The audio (len={audio_len}) is too short "
                             "to be represented inside the model")

        audio_tokens = [audio_token_id] * num_features

        return PromptUpdateDetails.select_token_id(
            [audio_bos_id] + audio_tokens + [audio_eos_id],
            embed_token_id=audio_token_id,
        )

    return [
        PromptReplacement(
            modality="audio",
            target=audio_token,
            replacement=get_replacement_qwen2_audio,
        )
    ]
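Concretely, each <|AUDIO|> placeholder in the prompt is replaced by <|audio_bos|>, one <|AUDIO|> token per encoder output feature, and <|audio_eos|>. A sketch with hypothetical token ids and a very short audio:

audio_bos_id, audio_token_id, audio_eos_id = 101, 100, 102  # hypothetical ids
num_features = 3
replacement = [audio_bos_id] + [audio_token_id] * num_features + [audio_eos_id]
print(replacement)  # [101, 100, 100, 100, 102]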

Qwen2AudioMultiModalProjector

Bases: Module

Source code in vllm/model_executor/models/qwen2_audio.py
class Qwen2AudioMultiModalProjector(nn.Module):

    def __init__(self, audio_hidden_size: int, text_hidden_size: int):
        super().__init__()
        self.linear = nn.Linear(audio_hidden_size, text_hidden_size, bias=True)

    def forward(self, audio_features):
        hidden_states = self.linear(audio_features)
        return hidden_states

linear instance-attribute

linear = Linear(
    audio_hidden_size, text_hidden_size, bias=True
)

__init__

__init__(audio_hidden_size: int, text_hidden_size: int)
Source code in vllm/model_executor/models/qwen2_audio.py
def __init__(self, audio_hidden_size: int, text_hidden_size: int):
    super().__init__()
    self.linear = nn.Linear(audio_hidden_size, text_hidden_size, bias=True)

forward

forward(audio_features)
Source code in vllm/model_executor/models/qwen2_audio.py
def forward(self, audio_features):
    hidden_states = self.linear(audio_features)
    return hidden_states
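A shape sketch, assuming representative sizes (a 1280-dimensional audio encoder and a 3584-dimensional language model; the real values come from the HuggingFace config):

import torch
from vllm.model_executor.models.qwen2_audio import Qwen2AudioMultiModalProjector

proj = Qwen2AudioMultiModalProjector(audio_hidden_size=1280, text_hidden_size=3584)
audio_features = torch.randn(2, 750, 1280)   # (num_audios, max_audio_tokens, d_model)
print(proj(audio_features).shape)            # torch.Size([2, 750, 3584])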

Qwen2AudioProcessingInfo

Bases: BaseProcessingInfo

Source code in vllm/model_executor/models/qwen2_audio.py
class Qwen2AudioProcessingInfo(BaseProcessingInfo):

    def get_hf_config(self):
        return self.ctx.get_hf_config(Qwen2AudioConfig)

    def get_hf_processor(
        self,
        *,
        # Ignored in initialization
        sampling_rate: Optional[int] = None,
        **kwargs: object,
    ) -> Qwen2AudioProcessor:
        return self.ctx.get_hf_processor(Qwen2AudioProcessor, **kwargs)

    def get_feature_extractor(
        self,
        *,
        # Ignored in initialization
        sampling_rate: Optional[int] = None,
    ) -> WhisperFeatureExtractor:
        hf_processor = self.get_hf_processor(sampling_rate=sampling_rate)
        feature_extractor = hf_processor.feature_extractor  # type: ignore
        assert isinstance(feature_extractor, WhisperFeatureExtractor)
        return feature_extractor

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"audio": None}

get_feature_extractor

get_feature_extractor(
    *, sampling_rate: Optional[int] = None
) -> WhisperFeatureExtractor
Source code in vllm/model_executor/models/qwen2_audio.py
def get_feature_extractor(
    self,
    *,
    # Ignored in initialization
    sampling_rate: Optional[int] = None,
) -> WhisperFeatureExtractor:
    hf_processor = self.get_hf_processor(sampling_rate=sampling_rate)
    feature_extractor = hf_processor.feature_extractor  # type: ignore
    assert isinstance(feature_extractor, WhisperFeatureExtractor)
    return feature_extractor

get_hf_config

get_hf_config()
Source code in vllm/model_executor/models/qwen2_audio.py
def get_hf_config(self):
    return self.ctx.get_hf_config(Qwen2AudioConfig)

get_hf_processor

get_hf_processor(
    *, sampling_rate: Optional[int] = None, **kwargs: object
) -> Qwen2AudioProcessor
Source code in vllm/model_executor/models/qwen2_audio.py
def get_hf_processor(
    self,
    *,
    # Ignored in initialization
    sampling_rate: Optional[int] = None,
    **kwargs: object,
) -> Qwen2AudioProcessor:
    return self.ctx.get_hf_processor(Qwen2AudioProcessor, **kwargs)

get_supported_mm_limits

get_supported_mm_limits() -> Mapping[str, Optional[int]]
Source code in vllm/model_executor/models/qwen2_audio.py
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
    return {"audio": None}

_get_feat_extract_output_lengths

_get_feat_extract_output_lengths(input_lengths: Tensor)
Source code in vllm/model_executor/models/qwen2_audio.py
def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
    feat_lengths = (input_lengths - 1) // 2 + 1
    output_lengths = (feat_lengths - 2) // 2 + 1
    return feat_lengths, output_lengths
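A worked example: 30 s of audio at the Whisper feature rate of 100 mel frames per second gives 3000 mel frames, which the two strided stages above reduce to 1500 conv features and 750 encoder output tokens.

import torch
from vllm.model_executor.models.qwen2_audio import _get_feat_extract_output_lengths

feat_lengths, output_lengths = _get_feat_extract_output_lengths(torch.tensor([3000]))
print(feat_lengths.item(), output_lengths.item())  # 1500 750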