Skip to content

vllm.multimodal.audio

AudioMediaIO

Bases: MediaIO[tuple[NDArray, float]]

Source code in vllm/multimodal/audio.py
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
    """Load and serialize audio as ``(samples, sample_rate)`` pairs.

    Decoding is delegated to ``librosa.load``; encoding writes WAV data
    through ``soundfile`` and base64-encodes the result.
    """

    def __init__(self, **kwargs) -> None:
        """Store the modality-specific keyword arguments on the instance."""
        super().__init__()

        # Whatever was supplied via --media-io-kwargs for this modality
        # lands here. It is only stored, so that underlying media loaders
        # (e.g. custom implementations) can pick it up for flexible control.
        self.kwargs = kwargs

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
        """Decode an in-memory audio payload."""
        stream = BytesIO(data)
        # sr=None is forwarded so librosa does not resample to its own
        # default rate (see the librosa.load documentation).
        return librosa.load(stream, sr=None)

    def load_base64(
        self,
        media_type: str,
        data: str,
    ) -> tuple[npt.NDArray, float]:
        """Decode base64 text and load the resulting bytes.

        ``media_type`` is accepted but not consulted here.
        """
        raw = base64.b64decode(data)
        return self.load_bytes(raw)

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
        """Decode the audio file located at ``filepath``."""
        decoded = librosa.load(filepath, sr=None)
        return decoded

    def encode_base64(self, media: tuple[npt.NDArray, float]) -> str:
        """Write ``media`` as WAV and return it as a base64 string."""
        samples, sample_rate = media

        with BytesIO() as wav_stream:
            soundfile.write(wav_stream, samples, sample_rate, format="WAV")
            wav_bytes = wav_stream.getvalue()

        encoded = base64.b64encode(wav_bytes)
        return encoded.decode("utf-8")

kwargs instance-attribute

kwargs = kwargs

__init__

__init__(**kwargs) -> None
Source code in vllm/multimodal/audio.py
def __init__(self, **kwargs) -> None:
    """Store the modality-specific keyword arguments on the instance."""
    super().__init__()

    # Whatever was supplied via --media-io-kwargs for this modality
    # lands here. It is only stored, so that underlying media loaders
    # (e.g. custom implementations) can pick it up for flexible control.
    self.kwargs = kwargs

encode_base64

encode_base64(media: tuple[NDArray, float]) -> str
Source code in vllm/multimodal/audio.py
def encode_base64(self, media: tuple[npt.NDArray, float]) -> str:
    """Write ``media`` as WAV and return it as a base64 string."""
    samples, sample_rate = media

    with BytesIO() as wav_stream:
        soundfile.write(wav_stream, samples, sample_rate, format="WAV")
        wav_bytes = wav_stream.getvalue()

    encoded = base64.b64encode(wav_bytes)
    return encoded.decode("utf-8")

load_base64

load_base64(
    media_type: str, data: str
) -> tuple[NDArray, float]
Source code in vllm/multimodal/audio.py
def load_base64(
    self,
    media_type: str,
    data: str,
) -> tuple[npt.NDArray, float]:
    """Decode base64 text and load the resulting bytes.

    ``media_type`` is accepted but not consulted here.
    """
    raw = base64.b64decode(data)
    return self.load_bytes(raw)

load_bytes

load_bytes(data: bytes) -> tuple[NDArray, float]
Source code in vllm/multimodal/audio.py
def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
    """Decode an in-memory audio payload."""
    stream = BytesIO(data)
    # sr=None is forwarded so librosa does not resample to its own
    # default rate (see the librosa.load documentation).
    return librosa.load(stream, sr=None)

load_file

load_file(filepath: Path) -> tuple[NDArray, float]
Source code in vllm/multimodal/audio.py
def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
    """Decode the audio file located at ``filepath``."""
    decoded = librosa.load(filepath, sr=None)
    return decoded

AudioResampler

Resample audio data to a target sample rate.

Source code in vllm/multimodal/audio.py
class AudioResampler:
    """Resample audio data to a target sample rate."""

    def __init__(
        self,
        target_sr: Optional[float] = None,
        method: Literal["librosa", "scipy"] = "librosa",
    ):
        """Record the desired output rate and the backend used to reach it."""
        self.method = method
        self.target_sr = target_sr

    def resample(
        self,
        audio: npt.NDArray[np.floating],
        *,
        orig_sr: float,
    ) -> npt.NDArray[np.floating]:
        """Resample ``audio`` from ``orig_sr`` to ``self.target_sr``.

        Raises:
            RuntimeError: If no ``target_sr`` was configured.
            ValueError: If ``self.method`` is not a supported backend.
        """
        wanted_sr = self.target_sr
        if wanted_sr is None:
            raise RuntimeError("Audio resampling is not supported when "
                               "`target_sr` is not provided")

        backend = self.method
        if backend == "librosa":
            return resample_audio_librosa(audio,
                                          orig_sr=orig_sr,
                                          target_sr=wanted_sr)
        if backend == "scipy":
            return resample_audio_scipy(audio,
                                        orig_sr=orig_sr,
                                        target_sr=wanted_sr)

        # Neither known backend matched.
        raise ValueError(f"Invalid resampling method: {self.method}. "
                         "Supported methods are 'librosa' and 'scipy'.")

method instance-attribute

method = method

target_sr instance-attribute

target_sr = target_sr

__init__

__init__(
    target_sr: Optional[float] = None,
    method: Literal["librosa", "scipy"] = "librosa",
)
Source code in vllm/multimodal/audio.py
def __init__(
    self,
    target_sr: Optional[float] = None,
    method: Literal["librosa", "scipy"] = "librosa",
):
    """Record the desired output rate and the backend used to reach it."""
    self.method = method
    self.target_sr = target_sr

resample

resample(
    audio: NDArray[floating], *, orig_sr: float
) -> NDArray[floating]
Source code in vllm/multimodal/audio.py
def resample(
    self,
    audio: npt.NDArray[np.floating],
    *,
    orig_sr: float,
) -> npt.NDArray[np.floating]:
    """Resample ``audio`` from ``orig_sr`` to ``self.target_sr``.

    Raises:
        RuntimeError: If no ``target_sr`` was configured.
        ValueError: If ``self.method`` is not a supported backend.
    """
    wanted_sr = self.target_sr
    if wanted_sr is None:
        raise RuntimeError("Audio resampling is not supported when "
                           "`target_sr` is not provided")

    backend = self.method
    if backend == "librosa":
        return resample_audio_librosa(audio,
                                      orig_sr=orig_sr,
                                      target_sr=wanted_sr)
    if backend == "scipy":
        return resample_audio_scipy(audio,
                                    orig_sr=orig_sr,
                                    target_sr=wanted_sr)

    # Neither known backend matched.
    raise ValueError(f"Invalid resampling method: {self.method}. "
                     "Supported methods are 'librosa' and 'scipy'.")

resample_audio_librosa

resample_audio_librosa(
    audio: NDArray[floating],
    *,
    orig_sr: float,
    target_sr: float,
) -> NDArray[floating]
Source code in vllm/multimodal/audio.py
def resample_audio_librosa(
    audio: npt.NDArray[np.floating],
    *,
    orig_sr: float,
    target_sr: float,
) -> npt.NDArray[np.floating]:
    """Resample ``audio`` from ``orig_sr`` to ``target_sr`` via librosa."""
    resampled = librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
    return resampled

resample_audio_scipy

resample_audio_scipy(
    audio: NDArray[floating],
    *,
    orig_sr: float,
    target_sr: float,
)
Source code in vllm/multimodal/audio.py
def resample_audio_scipy(
    audio: npt.NDArray[np.floating],
    *,
    orig_sr: float,
    target_sr: float,
):
    """Resample ``audio`` from ``orig_sr`` to ``target_sr`` with SciPy.

    The conversion uses ``scipy.signal.resample_poly`` with the rate ratio
    ``target_sr / orig_sr`` reduced to lowest integer terms, so ratios that
    are not whole numbers (e.g. 44100 -> 16000, i.e. 160/441) produce the
    requested rate instead of being truncated to an integer factor.

    Args:
        audio: Samples to resample.
        orig_sr: Sample rate of ``audio``; rounded to the nearest whole Hz.
        target_sr: Desired sample rate; rounded to the nearest whole Hz.

    Returns:
        The resampled samples, or ``audio`` itself (not a copy) when the
        two rates are equal.

    Raises:
        ValueError: If the rates differ and either rounds to less than 1 Hz.
    """
    # lazy import scipy.signal, otherwise it will crash doc build.
    import math

    import scipy.signal

    if orig_sr == target_sr:
        return audio

    # resample_poly needs integer up/down factors, so work in whole Hz.
    orig_hz = int(round(orig_sr))
    target_hz = int(round(target_sr))
    if orig_hz < 1 or target_hz < 1:
        raise ValueError(
            "Sample rates must be at least 1 Hz, got "
            f"orig_sr={orig_sr} and target_sr={target_sr}.")

    # Dividing both rates by their gcd gives the smallest exact ratio;
    # plain floor division here would turn 44100 -> 16000 into a
    # decimate-by-2 and silently yield 22050 Hz audio.
    common = math.gcd(orig_hz, target_hz)
    up = target_hz // common
    down = orig_hz // common

    # NOTE(review): resample_poly filters along axis 0 by default, exactly
    # as before this change; confirm the layout of multi-channel input.
    return scipy.signal.resample_poly(audio, up, down)